CrossTabCluster: Tools For Clsutering Cross Tables

Documented in hclust_table kmeans_table

sort_leaves <- function(merge, node) {
  if (merge[node, 1] < 0 & merge[node, 2] < 0) {
    c(merge[node, 1], merge[node, 2])
  } else if (merge[node, 1] < 0 & merge[node, 2] > 0) {
    c(sort_leaves(merge, merge[node, 2]), merge[node, 1])
  } else if (merge[node, 1] > 0 & merge[node, 2] < 0) {
    c(sort_leaves(merge, merge[node, 1]), merge[node, 2])
  } else if (merge[node, 1] > merge[node, 2]) {
    c(sort_leaves(merge, merge[node, 2]), sort_leaves(merge, merge[node, 1]))
  } else {
    c(sort_leaves(merge, merge[node, 1]), sort_leaves(merge, merge[node, 2]))
  }
}

lance_williams_update <- function(distances, masses, merge){
  inter_merge <- distances[merge[1], merge[2]]
  next_label <- max(0L, max(as.integer(rownames(distances)))) + 1L

  distances <- matrix(distances[, -merge], nrow = nrow(distances))

  old_distances <- matrix(distances[merge, ], nrow = 2)
  old_masses <- masses[merge]
  cluster_mass <- sum(old_masses)

  distances <- matrix(distances[-merge, ], ncol = ncol(distances))
  masses <- masses[-merge]

  new_distances <- sapply(seq_len(ncol(distances)), function(ind) {
    (masses[ind] + old_masses[1])/(masses[ind] + cluster_mass)*
      old_distances[1, ind] +
      (masses[ind] + old_masses[2])/(masses[ind] + cluster_mass)*
      old_distances[2, ind] -
      masses[ind]/(masses[ind] + cluster_mass)*
      inter_merge
  })

  new_distances <- cbind(rbind(distances, new_distances), c(new_distances, 0))
  colnames(new_distances)[ncol(new_distances)] <- next_label
  rownames(new_distances) <- colnames(new_distances)

  new_masses <- c(masses, cluster_mass)
  names(new_masses) <- colnames(new_distances)

  list(
    distances = new_distances,
    masses = new_masses
  )
}

#' Hierarchical clustering of contingency tables
#'
#' Applies Ward's agglomerative clustering to the rows or columns of the contingecy table.
#'
#' @inheritParams chisq_distance
#'
#' @return object of class hclust.
#' @export
#'
#' @examples
#' hclust_table(ksarakil)
#' hclust_table(israeli_survey, dimension = 2)
hclust_table <- function(table, dimension = 1) {
  if (dimension != 1 & dimension != 2) {
    stop("Wrong value for 'dimension' argument. Can be either 1 for rows or 2 for columns")
  }

  k <- dim(table)[dimension]

  if (k < 2) {
    stop("Must have two or more profiles to cluster")
  }

  #output prealocation
  merge <- matrix(nrow = k - 1, ncol = 2)
  height <- numeric(k - 1)

  #init
  distances <- cluster_distance(table, as.list(seq_len(k)), dimension)
  masses <- apply(table, dimension, sum)

  colnames(distances) <- rownames(distances) <- names(masses) <- -seq_len(k)

  for (i in seq_len(k-1)) {
    to_cluster <- as.integer(
      which(distances == min(distances[upper.tri(distances)]),
            arr.ind = TRUE)[1, ]
    )

    merge[i, ] <- as.integer(rownames(distances)[to_cluster])
    height[i] <- distances[to_cluster[1], to_cluster[2]]

    updates <- lance_williams_update(distances, masses, to_cluster)

    distances <- updates$distances

    masses <- updates$masses
  }

  structure(
    list(
      merge = merge,
      height = height,
      order = -sort_leaves(merge, nrow(merge)),
      labels = attr(table, "dimnames")[[dimension]],
      method = "ward.D2",
      call = match.call(),
      dist.method = "chisq"),
    class = "hclust")
}

simplified_ca <- function(table, dimension = 1) {
  if (dimension != 1 & dimension != 2) {
    stop("Wrong dimension specification. Can be either 1 for rows or 2 for columns")
  }

  if (dimension == 2) {
    table <- t(table)
  }

  N <- sum(table)

  row_masses <- rowSums(table)

  cc <- colSums(table)/N
  rr <- row_masses/N

  profiles <- table/row_masses
  decomposition <- svd(diag(sqrt(rr)) %*% sweep(profiles, 2, cc) %*% diag(sqrt(1/cc)))

  d <- decomposition$d
  n <- diag(1/sqrt(rr)) %*% decomposition$u
  m <- diag(sqrt(cc)) %*% decomposition$v
  f <- n %*% diag(d)
  g <- diag(d) %*% t(m)

  list(f = f, n = n, g = g, n = n, m = m, d = d)
}

#' K-means on correspondence analysis
#'
#' Performs weighted k-means clustering on k-1 in fir k-1 principal coordinates (or all of them),
#' obtained from correspondence analysis of contingency table.
#'
#' @inheritParams chisq_distance
#' @param k integer. number of clusters
#' @param ... arguments passed to kmeans.
#'
#' @return object of class k-means with additional information about clustewr membership of items
#' classified in the contingecy table and about sum of squares in low-dimensional space used in clustering.
#' @export
#'
#' @examples
#' kmeans_table(ksarakil, 1, 5)
#' kmeans_table(israeli_survey, 2, 2)
kmeans_table <- function(table, dimension = 1, k, ...) {
  args <- list(...)

  if ("x" %in% names(args)) {
    stop("Argument 'x' cannot be used as data is passed by kmeans_table itself")
  }

  if ("centers" %in% names(args)) {
    stop("Argument 'centers'  cannot be used as it is passed by kmeans_table itself")
  }

  if (dimension == 2) {
    table <- t(table)
  }

  row_masses <- rowSums(table)
  indices <- sort(rep(seq_len(nrow(table)), row_masses))
  coordinates <- simplified_ca(table, 1)$f
  coordinates <- coordinates[indices, 1:min(c(ncol(coordinates) - 1, k-1))]

  res <- do.call(stats::kmeans, c(list(x = coordinates, centers = k), args))

  res$cluster.item <- res$cluster
  res$size.item <- res$size

  res$cluster <- as.integer(apply(table(indices, res$cluster), 1, which.max))
  res$size <- as.integer(table(res$cluster))

  res$totss.reduced <- res$totss
  res$withinss.reduced <- res$withinss
  res$betweenss.reduced <- res$betweenss

  res$withinss <- do.call(c, row_within_ss(table, clusters2list(res$cluster)))
  res$totss <- stats::chisq.test(table, simulate.p.value = TRUE, B = 1)$statistic
  res$betweenss <- res$totss - sum(res$withinss)

  res
}