Nothing
#' Measures for Comparing Clusterings
#'
#' Given two partitions or clusterings \eqn{C_1} and \eqn{C_2}, it returns community comparison scores
#' corresponding with a set of designated methods. Note that two label vectors should be
#' of same length having either numeric or factor type. Currently we have 3 classes of methods
#' depending on methodological philosophy behind each. See below for the taxonomy.
#'
#' @section Category 1. Counting Pairs:
#' \tabular{cl}{
#' TYPE \tab FULL NAME \cr
#' \code{'adjrand'} \tab \href{https://en.wikipedia.org/wiki/Rand_index}{Adjusted Rand index}.\cr
#' \code{'chisq'} \tab \href{https://en.wikipedia.org/wiki/Chi-squared_test}{Chi-Squared Coefficient}.\cr
#' \code{'fmi'} \tab \href{https://en.wikipedia.org/wiki/Fowlkes-Mallows_index}{Fowlkes-Mallows index}.\cr
#' \code{'jaccard'} \tab \href{https://en.wikipedia.org/wiki/Jaccard_index}{Jaccard index}.\cr
#' \code{'mirkin'} \tab Mirkin Metric, or Equivalence Mismatch Distance. \cr
#' \code{'overlap'} \tab \href{https://en.wikipedia.org/wiki/Overlap_coefficient}{Overlap Coefficient}, or Szymkiewicz-Simpson coefficient.\cr
#' \code{'pd'} \tab Partition Difference.\cr
#' \code{'rand'} \tab \href{https://en.wikipedia.org/wiki/Rand_index}{Rand Index}.\cr
#' \code{'sdc'} \tab \href{https://en.wikipedia.org/wiki/Sorensen-Dice_coefficient}{Sørensen–Dice Coefficient}.\cr
#' \code{'smc'} \tab \href{https://en.wikipedia.org/wiki/Simple_matching_coefficient}{Simple Matching Coefficient}.\cr
#' \code{'tanimoto'} \tab \href{https://en.wikipedia.org/wiki/Jaccard_index}{Tanimoto index}.\cr
#' \code{'tversky'} \tab \href{https://en.wikipedia.org/wiki/Tversky_index}{Tversky index}.\cr
#' \code{'wallace1'} \tab Wallace Criterion Type 1.\cr
#' \code{'wallace2'} \tab Wallace Criterion Type 2.
#' }
#' Note that Tanimoto Coefficient and Dice's coefficient are special cases with (alpha,beta) = (1,1) and (0.5,0.5), respectively.
#'
#' @section Category 2. Set Overlaps/Matching:
#' \tabular{cl}{
#' TYPE \tab FULL NAME \cr
#' \code{'f'} \tab F-Measure. \cr
#' \code{'mhm'} \tab Meila-Heckerman Measure. \cr
#' \code{'mmm'} \tab Maximum-Match Measure. \cr
#' \code{'vdm'} \tab Van Dongen Measure.
#' }
#'
#' @section Category 3. Information Theory:
#' \tabular{cl}{
#' TYPE \tab FULL NAME \cr
#' \code{'jent'} \tab \href{https://en.wikipedia.org/wiki/Joint_entropy}{Joint Entropy} \cr
#' \code{'mi'} \tab Mutual Information. \cr
#' \code{'nmi1'} \tab \href{https://en.wikipedia.org/wiki/Mutual_information}{Normalized Mutual Information} by Strehl and Ghosh. \cr
#' \code{'nmi2'} \tab \href{https://en.wikipedia.org/wiki/Mutual_information}{Normalized Mutual Information} by Fred and Jain. \cr
#' \code{'nmi3'} \tab Normalized Mutual Information by Danon et al. \cr
#' \code{'nvi'} \tab Normalized Variation of Information. \cr
#' \code{'vi'} \tab \href{https://en.wikipedia.org/wiki/Variation_of_information}{Variation of Information}.
#' }
#'
#' @param x,y vectors of clustering labels
#' @param types \code{"all"} for returning scores for every available measure.
#' Either a single score name or a vector of score names can be supplied. See the section
#' for the list of the methods for details.
#' @param tversky.param a list of parameters for Tversky index; \code{alpha} and \code{beta} for
#' weight parameters, and \code{sym}, a logical where \code{FALSE} stands for original method, \code{TRUE}
#' for a revised variant to symmetrize the score. Default (alpha,beta)=(1,1).
#'
#' @return a data frame with columns \code{types} and corresponding \code{scores}.
#'
#' @examples
#' ## example 1. compare two identical clusterings
#' x = sample(1:5,20,replace=TRUE) # label from 1 to 5, 10 elements
#' y = x # set two labels x and y equal
#' mclustcomp(x,y) # show all results
#'
#' ## example 2. selection of a few methods
#' z = sample(1:4,20,replace=TRUE) # generate a non-trivial clustering
#' cmethods = c("jaccard","tanimoto","rand") # select 3 methods
#' mclustcomp(x,z,types=cmethods) # test with the selected scores
#'
#' ## example 3. tversky.param
#' tparam = list() # create an empty list
#' tparam$alpha = 2
#' tparam$beta = 3
#' tparam$sym = TRUE
#' mclustcomp(x,z,types="tversky") # default set as Tanimoto case.
#' mclustcomp(x,z,types="tversky",tversky.param=tparam)
#'
#'
#' @references
#' \insertRef{strehl_cluster_2003}{mclustcomp}
#'
#' \insertRef{meila_comparing_2007}{mclustcomp}
#'
#' \insertRef{goos_comparing_2003}{mclustcomp}
#'
#' \insertRef{wagner_comparing_2007}{mclustcomp}
#'
#' \insertRef{albatineh_similarity_2006}{mclustcomp}
#'
#' \insertRef{mirkin_eleven_2001}{mclustcomp}
#'
#' \insertRef{rand_objective_1971}{mclustcomp}
#'
#' \insertRef{kuncheva_using_2004}{mclustcomp}
#'
#' \insertRef{fowlkes_method_1983}{mclustcomp}
#'
#' \insertRef{dongen_performance_2000}{mclustcomp}
#'
#' \insertRef{jaccard_distribution_1912}{mclustcomp}
#'
#' \insertRef{li_combining_2010}{mclustcomp}
#'
#' \insertRef{larsen_fast_1999}{mclustcomp}
#'
#' \insertRef{meila_experimental_2001}{mclustcomp}
#'
#' \insertRef{cover_elements_2006}{mclustcomp}
#'
#' \insertRef{ana_robust_2003}{mclustcomp}
#'
#' \insertRef{wallace_comment_1983}{mclustcomp}
#'
#' \insertRef{simpson_mammals_1943}{mclustcomp}
#'
#' \insertRef{dice_measures_1945}{mclustcomp}
#'
#' \insertRef{segaran_programming_2007}{mclustcomp}
#'
#' \insertRef{tversky_features_1977}{mclustcomp}
#'
#' \insertRef{danon_comparing_2005}{mclustcomp}
#'
#' \insertRef{lancichinetti_detecting_2009}{mclustcomp}
#'
#' @export
mclustcomp <- function(x,y,types="all",tversky.param=list()){
#------------------------------------------------------------------------
## PREPROCESSING
# 1. size argument
if ((!is.vector(x))||(!is.vector(y))){
stop("* mclustcomp : input 'x' and 'y' should both be a vector of class labels.")
}
n = length(x)
if (length(y)!=n){
stop("* mclustcomp : two vectors should be of same size.")
}
# 2. type conversion and unique vector
x = aux.conversion(x)
y = aux.conversion(y)
ux = unique(x)
uy = unique(y)
if (length(ux)==1){ warning("* mclustcomp : 'x' is a trivial clustering.") }
if (length(uy)==1){ warning("* mclustcomp : 'y' is a trivial clustering.") }
if (length(ux)==n){ warning("* mclustcomp : 'x' is the singleton clustering.") }
if (length(uy)==n){ warning("* mclustcomp : 'y' is the singleton clustering.") }
# 3. tversky parameter
listdot = as.list(environment())
if ("tversky.param" %in% names(listdot)){
tversky.param = listdot$tversky.param
} else {
tversky.param = list()
}
if (!("alpha" %in% names(tversky.param))){tversky.param$alpha = 1}
if (!("beta" %in% names(tversky.param))){tversky.param$beta = 1}
if (!("sym" %in% names(tversky.param))){tversky.param$sym = FALSE}
if (tversky.param$alpha < 0){stop("* mclustcomp : tversky.param$alpha should be >= 0.")}
if (tversky.param$beta < 0){stop("* mclustcomp : tversky.param$beta should be >= 0.")}
if (!is.logical(tversky.param$sym)){stop("* mclustcomp : tversky.param$sym should
be a logical variable; FALSE for original Tversky index, TRUE for a variant.")}
#------------------------------------------------------------------------
## PRELIMINARY COMPUTATIONS
## Prelim1 : CONFUSION MATRIX of size(length(ux),length(uy))
confmat = get.confusion(x,y,ux,uy)
## Prelim2 : size of each cluster
scx = get.commsize(x,ux)
scy = get.commsize(y,uy)
## Prelim3 : comembership matrix of (2,2)
pairmat = get.pair(x,y)
## Prelim4 : probability-related stuffs for Mutual Information
threps = min(1e-10,10*(.Machine$double.eps))
probs = get.probs(confmat,scx,scy,n,threps)
## Control : type.out
## Case 1 : Single Argument
## {"all" or single name}
## Case 2 : a vector of names; c("f","rand")
type_allnames = c("adjrand","chisq","f","fmi","jaccard","mhm","mirkin","mmm",
"mi","nmi1","nmi2","overlap","pd","rand","sdc","smc","tanimoto",
"tversky","vdm","vi","wallace1","wallace2","jent","nvi")
type_out = unique(types)
if ("all" %in% type_out){
type_test = sort(type_allnames)
} else {
type_test = sort(type_out) # this type test is the one we should generate again
}
#------------------------------------------------------------------------
## MAIN COMPUTATION
type_score = rep(0,length(type_test))
for (i in 1:length(type_test)){
type_score[i] = mclustsingle(n,x,y,ux,uy,scx,scy,confmat,pairmat,probs,threps,type_test[i],tversky.param)
}
#------------------------------------------------------------------------
## RETURN RESULTS
result = data.frame(types=type_test,scores=type_score)
return(result)
}
# COMPUTE :: single measure branching -------------------------------------
## Original Implementation of 19 methods
mclustsingle <- function(n,x,y,ux,uy,scx,scy,confmat,pairmat,probs,threps,type,tversky.param){
# Missing parameters for score08_mmm
nk = length(scx)
nl = length(scy)
# Sepearting probs for NMI and VIs
Ixy = probs$Ixy
Hx = probs$Hx
Hy = probs$Hy
Pxy = (confmat/n) # joint probability matrix + correction
Pxy[(Pxy<min(threps))] = min(threps)
# Tversky parameter
t.alpha = tversky.param$alpha
t.beta = tversky.param$beta
t.sym = tversky.param$sym
switch(type,
"chisq" = {output = single01_chisq(confmat,scx,scy,n)},
"rand" = {output = single02_rand(pairmat,n)},
"adjrand" = {output = single03_adjrand(confmat,scx,scy,n)},
"fmi" = {output = single04_fmi(pairmat)},
"mirkin" = {output = single05_mirkin(confmat,scx,scy)},
"jaccard" = {output = single06_jaccard(pairmat)},
"pd" = {output = single07_pd(pairmat)},
"f" = {output = single08_f(scx,scy,n)},
"mhm" = {output = single09_mhm(confmat,n)},
"mmm" = {output = single10_mmm(confmat,n,nk,nl)},
"vdm" = {output = single11_vdm(confmat,n)},
"mi" = {output = single12_mi(Ixy)},
"nmi1" = {output = single13_nmi1(Ixy,Hx,Hy,threps)},
"nmi2" = {output = single14_nmi2(Ixy,Hx,Hy,threps)},
"vi" = {output = single15_vi(Ixy,Hx,Hy,threps)},
"wallace1" = {output = single16_wallace1(pairmat,scx)},
"wallace2" = {output = single17_wallace2(pairmat,scy)},
"overlap" = {output = single18_overlap(pairmat)},
"sdc" = {output = single19_sdc(pairmat)},
"smc" = {output = single20_smc(pairmat)},
"tanimoto" = {output = single22_tversky(pairmat,1,1,FALSE)},
"tversky" = {output = single22_tversky(pairmat,t.alpha,t.beta,t.sym)},
"jent" = {output = single23_jent(Pxy)},
"nmi3" = {output = single24_nmi3(Hx,Hy,Pxy)},
"nvi" = {output = single25_nvi(Hx,Hy,Ixy,threps)}
)
# return output
return(output)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.