R/CLAG_compareclust.R
In CLAG: An unsupervised non hierarchical clustering algorithm handling biological data

# ===== CLUSTERING COMPARISON ======

# Count how many elements are differently clustered
# considering the cluster mapping assoc.
# assoc[i] = j means cluser i in cl1 is cluster j in cl2
mapClusteringsTest <- function(cl1, cl2, assoc) {
  stopifnot(length(cl1) == length(cl2))
  bad <- 0
  diffclust <- rep(FALSE, length(cl1))
  for (i in 1:length(cl1)) {
    if (cl1[[i]] == 0) {
      # i is unclusted in cl1
      if (cl2[[i]] != 0) {
        diffclust[[i]] <- TRUE
        bad <- bad + 1
      }
    } else {
      # find associated cluster
      if (cl2[[i]] == 0 || assoc[[cl1[[i]]]] != cl2[[i]]) {
        diffclust[[i]] <- TRUE
        bad <- bad + 1
      }
    }
  }
  return(list(ndiff=bad, diffclust=which(diffclust)))
}


# Hypothesis: cl1 and cl2 have non zeros (only clustered elements)
mapClusteringsAux <- function(cl1, cl2, verbose) {
  stopifnot(length(cl1) == length(cl2))
  stopifnot(all(cl1 > 0))
  stopifnot(all(cl2 > 0))
  # number of clusters in cl1 and cl2
  ncl1 <- max(cl1)
  ncl2 <- max(cl2)
  bestForNow <- Inf
  aux <- function(assoc, k, cost) {
    if (k > length(assoc)) {
      if (verbose) {
        cat("assoc=", assoc, " bad=", cost, " bestForNow=", bestForNow, "\n")
      }
      return(list(bad=cost, assoc=assoc))
    } else {
      # Search the partner for cluster k
      best <- Inf
      bestassoc <- NA
      candidates <- unique(cl2[cl1 == k & cl2 != 0])
      candidates <- c(candidates[! (candidates %in% assoc[1:(k-1)])], 0)
      elements <- which(cl1 == k)
      for (j in candidates) {
        assoc[k] <- j
        # How many differently clustered elements does this choice generate?
        # special case: if j = 0 all will be different (hyp: no 0 in cl2)
        morebad <- sum(cl2[elements] != j)
        newcost <- cost + morebad
        # If cost already exceeds (or equals) lowest cost known,
        # it is usless to explore the branch (cut it).
        if (newcost < bestForNow && bestForNow > 0) {
          res <- aux(assoc, k+1, newcost)
          if (res$bad < best) {
            best <- res$bad
            bestassoc <- res$assoc
            if (res$bad < bestForNow) {
              bestForNow <<- res$bad
            }
          }
        }
      }
      return(list(bad=best, assoc=bestassoc))
    }
  }
  return(aux(rep(0L, ncl1), 1, 0))
}


# Given two clusterings, find the best association between clusters
# of cl1 with clusters of cl2
# (injective in both directions - but not necessarly surjective)
# such that the number of differently clustered elements is minimum.
# An element e is identicaly clustered if one of these conditions is true:
# 1. it belongs to a cluster i in cl1 
#    and j in cl2 and i is associated with j.
# 2. e is unclustered in both cl1 and cl2
# Differently clustered is the opposite.
# Conventions:
# 0 = unclustered
# integer >= 1: number of the cluster to which the element belongs
mapClusterings <- function(cl1, cl2, verbose=FALSE, use.solve.LSAP=NULL) {
  if ((!is.null(use.solve.LSAP) && use.solve.LSAP)
      || (is.null(use.solve.LSAP) && (exists("solve_LSAP") || ("clue" %in% installed.packages()[,1])))) {
    # Use solve_LSAP instead of our branch and bound algorithm
    # because solve_LSAP is much faster.
    return(mapClusteringsLSAP(cl1, cl2))
  } else {
    cl1 <- as.integer(cl1)
    cl2 <- as.integer(cl2)
    stopifnot(all(cl1 >= 0))
    stopifnot(all(cl2 >= 0))
    stopifnot(length(cl1) == length(cl2))
    # When an element is unclusted in one clustering only,
    # it will count as "different" whatever the cluster mapping is
    unavoidableCost <- sum(xor(cl1 == 0, cl2 == 0))
    # It is useless to try to map unclunstered elements, se remove then
    keep <- cl1 != 0 & cl2 != 0
    res <- mapClusteringsAux(cl1[keep], cl2[keep], verbose=verbose)
    ndiff <- res$bad + unavoidableCost
    
    # Check the score by an independant method
    test <- mapClusteringsTest(cl1, cl2, res$assoc)
    stopifnot(test$ndiff == ndiff)
    
    return(list(ndiff=ndiff, assoc=res$assoc, diffclust=test$diffclust))
  }
}


# Does the same job as mapClusterings, except that it uses
# solve_LSPA from clue package to solve the problem.
mapClusteringsLSAP <- function(cl1, cl2) {
  if (!exists("solve_LSAP")) {
    library(clue)
  }
  
  cl1 <- as.integer(cl1)
  cl2 <- as.integer(cl2)
  stopifnot(all(cl1 >= 0))
  stopifnot(all(cl2 >= 0))
  stopifnot(length(cl1) == length(cl2))
  
  ncl1 <- max(cl1)
  ncl2 <- max(cl2)
  N <- max(ncl1, ncl2)
  M <- matrix(0, nrow=N, ncol=N)
  for (i in 1:ncl1) {
    for (j in 1:ncl2) {
      M[i,j] <- sum(cl1 == i & cl2 == j)
    }
  }
  
  solution <- solve_LSAP(M, maximum=TRUE)
  solution <- solution[1:ncl1]
  solution[solution > ncl2] <- 0
  
  # Count how many elements are differently clustered
  testres <- mapClusteringsTest(cl1, cl2, solution)
  
  return(list(ndiff=testres$ndiff, assoc=solution, diffclust=testres$diffclust))
}


compareClusterings <- function(cl1, cl2, verbose=FALSE, use.solve.LSAP=NULL) {
  res <- mapClusterings(cl1, cl2, verbose=verbose, use.solve.LSAP=use.solve.LSAP)
  cat("Cluster mapping:", res$assoc, "\n")
  cl1a <- rep(NA, length(cl1))
  for (i in 1:length(cl1)) {
    if (cl1[[i]] == 0) {
      cl1a[[i]] <- 0
    } else {
      cl1a[[i]] <- res$assoc[[cl1[[i]]]]
    }
  }
  cat("CL1 original:", cl1, "\n")
  cat("CL1 renamed: ", cl1a, "\n")
  cat("CL2 original:", cl2, "\n")
  cat("Differently clustered elements:", res$ndiff, "\n")
  return(invisible(res))
}

Any scripts or data that you put into this service are public.

CLAG documentation built on May 2, 2019, 5:49 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

CLAG
An unsupervised non hierarchical clustering algorithm handling biological data

R/CLAG_compareclust.R
In CLAG: An unsupervised non hierarchical clustering algorithm handling biological data

Try the CLAG package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

CLAG An unsupervised non hierarchical clustering algorithm handling biological data

R/CLAG_compareclust.R In CLAG: An unsupervised non hierarchical clustering algorithm handling biological data

Try the CLAG package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

CLAG
An unsupervised non hierarchical clustering algorithm handling biological data

R/CLAG_compareclust.R
In CLAG: An unsupervised non hierarchical clustering algorithm handling biological data