## tests/clara-gower.R

## Originally inspired by  Kasper Fischer-Rasmussen 's  clara_gower.html  [html from Rmd]

library(cluster)
packageDescription("cluster")

## Take one run of rows out of each of the 3 xclara clusters:
## 150 + 200 + 150 = 500 observations in total.
dd <- xclara[c(1:150, 1001:1200, 2101:2250), ]
str(dd)
dim(dd) # 500 2

## Fit CLARA with k = 3 twice on the same data, once per metric.
## Note that set.seed() is called only once, before the first fit.
set.seed(47)
cl_manhat <- clara(dd, k = 3, metric = "manhattan",
                   samples = 500, rngR = TRUE, pamLike = TRUE)
cl_gower <- clara(dd, k = 3, metric = "gower",
                  samples = 500, rngR = TRUE, pamLike = TRUE)

## Cross-tabulate the two cluster assignments (rows: Manhattan, cols: Gower).
table(cl_manhat$cluster, cl_gower$cluster)

## Regression checks on the two partitions computed above.
stopifnot(exprs = {
    ## Observation [188] is left out of the comparison: the two fits usually
    ## agree there as well, but not always.
    ##    {FIXME ??? Random? even though we use rngR?}
    cl_manhat$cluster[-188] == cl_gower$cluster[-188]
    ## The Manhattan partition is pinned to its exact run-length encoding.
    identical(
        rle(unname(cl_manhat$cluster)),
        structure(
            list(lengths = c(29L, 1L, 120L, 80L, 1L, 119L, 150L),
                 values  = c(1L, 2L, 1L, 2L, 1L, 2L, 3L)),
            class = "rle"))
})
## ==> no distinction between the clusters wrt Manhattan vs. Gower's distance.


## Using {cluster}'s built in tools to compute Gower's distance.

## Here sampsize = nrow(dd), i.e. the sample size equals the number of rows,
## so the dissimilarities stored by clara() can be compared with daisy()'s.
cl_gower_full <- clara(dd, k = 3, metric = "gower",
                       rngR = TRUE, pamLike = TRUE,
                       samples = 500, sampsize = nrow(dd))
dist_cl_full <- as.matrix(cl_gower_full$diss)
## Row labels of clara's dissimilarities; used to index the rows of 'dd' in
## the same order before passing them to daisy().
i_full <- rownames(dist_cl_full)
d_full <- data.frame(CLARA = as.vector(cl_gower_full$diss),
                     DAISY = as.vector(daisy(dd[i_full, ], metric = "gower")))

## MM: instead of all this, just
## ('tolerance' is spelled out in full rather than abbreviated to 'tol', so
##  the call does not depend on partial argument matching.)
all.equal(d_full$CLARA,
          d_full$DAISY, tolerance = 0) # "Mean relative difference: 2.17e-16"
## ... but sometimes *VERY* different (relative diff.   0.5xxx)
if (FALSE) { # disabled -- presumably because of the occasional large difference
    stopifnot(all.equal(d_full$CLARA,
                        d_full$DAISY, tolerance = 1e-15)) ## equal up to  15 digits!
}

## We can see that the distance measurements are exactly identical when the
## whole data is used in the clustering. This is because the Gower distance
## scales the distance measurements by the range of each feature. With
## subsampling, only approximate ranges are calculated from each subsample,
## which explains the deviations.


## MM: compare -- with pam():
dGow <- daisy(dd, metric = "gower")
## One single sample whose size equals the number of rows of 'dd'.
cl_full <- clara(dd, k = 3, metric = "gower",
                 rngR = TRUE, pamLike = TRUE,
                 samples = 1, sampsize = nrow(dd))

## 'tolerance' is spelled out in full (not 'tol'), so the calls below do not
## depend on partial argument matching.
all.equal(c(dGow), c(cl_full$diss),
          tolerance = 0) # "Mean relative difference: 2.171402e-16"

pam_3 <- pam(dGow, k = 3, variant = "faster")
## FIXME !! -- bug !?
all.equal(pam_3$clustering, # we would want *identical* -- bug ??
          cl_full$clustering)
all.equal(c(dGow), c(cl_full$diss), tolerance = 1e-15)
if (FALSE) { ## FIXME
    stopifnot(exprs = {
        identical(pam_3$clustering,
                  cl_full$clustering)
        all.equal(c(dGow), c(cl_full$diss), tolerance = 1e-15)
    })
}

## Try the cluster package in your browser

## Any scripts or data that you put into this service are public.

## cluster documentation built on Nov. 28, 2023, 1:07 a.m.