R/CLAG_compareclust.R

# ===== CLUSTERING COMPARISON ======

# Count how many elements are differently clustered
# considering the cluster mapping assoc.
# assoc[i] = j means cluser i in cl1 is cluster j in cl2
mapClusteringsTest <- function(cl1, cl2, assoc) {
  stopifnot(length(cl1) == length(cl2))
  bad <- 0
  diffclust <- rep(FALSE, length(cl1))
  for (i in 1:length(cl1)) {
    if (cl1[[i]] == 0) {
      # i is unclusted in cl1
      if (cl2[[i]] != 0) {
        diffclust[[i]] <- TRUE
        bad <- bad + 1
      }
    } else {
      # find associated cluster
      if (cl2[[i]] == 0 || assoc[[cl1[[i]]]] != cl2[[i]]) {
        diffclust[[i]] <- TRUE
        bad <- bad + 1
      }
    }
  }
  return(list(ndiff=bad, diffclust=which(diffclust)))
}


# Hypothesis: cl1 and cl2 have non zeros (only clustered elements)
mapClusteringsAux <- function(cl1, cl2, verbose) {
  stopifnot(length(cl1) == length(cl2))
  stopifnot(all(cl1 > 0))
  stopifnot(all(cl2 > 0))
  # number of clusters in cl1 and cl2
  ncl1 <- max(cl1)
  ncl2 <- max(cl2)
  bestForNow <- Inf
  aux <- function(assoc, k, cost) {
    if (k > length(assoc)) {
      if (verbose) {
        cat("assoc=", assoc, " bad=", cost, " bestForNow=", bestForNow, "\n")
      }
      return(list(bad=cost, assoc=assoc))
    } else {
      # Search the partner for cluster k
      best <- Inf
      bestassoc <- NA
      candidates <- unique(cl2[cl1 == k & cl2 != 0])
      candidates <- c(candidates[! (candidates %in% assoc[1:(k-1)])], 0)
      elements <- which(cl1 == k)
      for (j in candidates) {
        assoc[k] <- j
        # How many differently clustered elements does this choice generate?
        # special case: if j = 0 all will be different (hyp: no 0 in cl2)
        morebad <- sum(cl2[elements] != j)
        newcost <- cost + morebad
        # If cost already exceeds (or equals) lowest cost known,
        # it is usless to explore the branch (cut it).
        if (newcost < bestForNow && bestForNow > 0) {
          res <- aux(assoc, k+1, newcost)
          if (res$bad < best) {
            best <- res$bad
            bestassoc <- res$assoc
            if (res$bad < bestForNow) {
              bestForNow <<- res$bad
            }
          }
        }
      }
      return(list(bad=best, assoc=bestassoc))
    }
  }
  return(aux(rep(0L, ncl1), 1, 0))
}


# Given two clusterings, find the best association between clusters
# of cl1 with clusters of cl2
# (injective in both directions - but not necessarly surjective)
# such that the number of differently clustered elements is minimum.
# An element e is identicaly clustered if one of these conditions is true:
# 1. it belongs to a cluster i in cl1 
#    and j in cl2 and i is associated with j.
# 2. e is unclustered in both cl1 and cl2
# Differently clustered is the opposite.
# Conventions:
# 0 = unclustered
# integer >= 1: number of the cluster to which the element belongs
mapClusterings <- function(cl1, cl2, verbose=FALSE, use.solve.LSAP=NULL) {
  if ((!is.null(use.solve.LSAP) && use.solve.LSAP)
      || (is.null(use.solve.LSAP) && (exists("solve_LSAP") || ("clue" %in% installed.packages()[,1])))) {
    # Use solve_LSAP instead of our branch and bound algorithm
    # because solve_LSAP is much faster.
    return(mapClusteringsLSAP(cl1, cl2))
  } else {
    cl1 <- as.integer(cl1)
    cl2 <- as.integer(cl2)
    stopifnot(all(cl1 >= 0))
    stopifnot(all(cl2 >= 0))
    stopifnot(length(cl1) == length(cl2))
    # When an element is unclusted in one clustering only,
    # it will count as "different" whatever the cluster mapping is
    unavoidableCost <- sum(xor(cl1 == 0, cl2 == 0))
    # It is useless to try to map unclunstered elements, se remove then
    keep <- cl1 != 0 & cl2 != 0
    res <- mapClusteringsAux(cl1[keep], cl2[keep], verbose=verbose)
    ndiff <- res$bad + unavoidableCost
    
    # Check the score by an independant method
    test <- mapClusteringsTest(cl1, cl2, res$assoc)
    stopifnot(test$ndiff == ndiff)
    
    return(list(ndiff=ndiff, assoc=res$assoc, diffclust=test$diffclust))
  }
}


# Does the same job as mapClusterings, except that it uses
# solve_LSPA from clue package to solve the problem.
mapClusteringsLSAP <- function(cl1, cl2) {
  if (!exists("solve_LSAP")) {
    library(clue)
  }
  
  cl1 <- as.integer(cl1)
  cl2 <- as.integer(cl2)
  stopifnot(all(cl1 >= 0))
  stopifnot(all(cl2 >= 0))
  stopifnot(length(cl1) == length(cl2))
  
  ncl1 <- max(cl1)
  ncl2 <- max(cl2)
  N <- max(ncl1, ncl2)
  M <- matrix(0, nrow=N, ncol=N)
  for (i in 1:ncl1) {
    for (j in 1:ncl2) {
      M[i,j] <- sum(cl1 == i & cl2 == j)
    }
  }
  
  solution <- solve_LSAP(M, maximum=TRUE)
  solution <- solution[1:ncl1]
  solution[solution > ncl2] <- 0
  
  # Count how many elements are differently clustered
  testres <- mapClusteringsTest(cl1, cl2, solution)
  
  return(list(ndiff=testres$ndiff, assoc=solution, diffclust=testres$diffclust))
}


compareClusterings <- function(cl1, cl2, verbose=FALSE, use.solve.LSAP=NULL) {
  res <- mapClusterings(cl1, cl2, verbose=verbose, use.solve.LSAP=use.solve.LSAP)
  cat("Cluster mapping:", res$assoc, "\n")
  cl1a <- rep(NA, length(cl1))
  for (i in 1:length(cl1)) {
    if (cl1[[i]] == 0) {
      cl1a[[i]] <- 0
    } else {
      cl1a[[i]] <- res$assoc[[cl1[[i]]]]
    }
  }
  cat("CL1 original:", cl1, "\n")
  cat("CL1 renamed: ", cl1a, "\n")
  cat("CL2 original:", cl2, "\n")
  cat("Differently clustered elements:", res$ndiff, "\n")
  return(invisible(res))
}

Try the CLAG package in your browser

Any scripts or data that you put into this service are public.

CLAG documentation built on May 2, 2019, 5:49 p.m.