tests/test_fastcluster.R

#  fastcluster: Fast hierarchical clustering routines for R and Python
#
#  Copyright:
#    * Until package version 1.1.23: © 2011 Daniel Müllner <http://danifold.net>
#    * All changes from version 1.1.24 on: © Google Inc. <http://google.com>
#
# Test script for the R interface

seed = as.integer(runif(1, 0, 1e9))
set.seed(seed)
cat(sprintf("Random seed: %d\n",seed))

print_seed <- function() {
  return(sprintf('
Please send a report to the author of the \'fastcluster\' package, Daniel Müllner.
For contact details, see <http://danifold.net>. To make the error
reproducible, you must include the following number (the random seed value) in
your error report: %d.\n\n', seed))
}

hasWardD2 = getRversion() >= '3.1.0'

# Compare two dendrograms and check whether they are equal, except that
# ties may be resolved differently.
compare <- function(dg1, dg2) {
  h1 <- dg1$height
  h2 <- dg2$height
  # "height" vectors may have small numerical errors.
  rdiffs <- abs(h1-h2)/pmax(abs(h1),abs(h2))
  rdiffs = rdiffs[complete.cases(rdiffs)]
  rel_error <-  max(rdiffs)
  # We allow a relative error of 1e-13.
  if (rel_error>1e-13) {
    print(h1)
    print(h2)
    cat(sprintf('Height vectors differ! The maximum relative error is %e.\n', rel_error))
    return(FALSE)
  }
  # Filter the indices where consecutive merging distances are distinct.
  d = diff(dg1$height)
  b = (c(d,1)!=0 & c(1,d)!=0)
  #cat(sprintf("Percentage of indices where we can test: %g.\n",100.0*length(b[b])/length(b)))
  if (any(b)) {
    m1 = dg1$merge[b,]
    m2 = dg2$merge[b,]

    r = function(i) {
      if (i<0) {
        return(1)
      }
      else {
        return(b[i])
      }
    }

    f = sapply(m1,r)
    fm1 = m1*f
    fm2 = m2*f
    # The "merge" matrices must be identical whereever indices are not ambiguous
    # due to ties.
    if (!identical(fm1,fm2)) {
      cat('Merge matrices differ!\n')
      return(FALSE)
    }
    # Compare the "order" vectors only if all merging distances were distinct.
    if (all(b) && !identical(dg1$order,dg2$order)) {
      cat('Order vectors differ!\n')
      return(FALSE)
    }
  }
  return(TRUE)
}

# Generate uniformly distributed random data
generate.uniform <- function() {
  n = sample(10:1000,1)
  range_exp = runif(1,min=-10, max=10)
  cat(sprintf("Number of sample points: %d\n",n))
  cat(sprintf("Dissimilarity range: [0,%g]\n",10^range_exp))
  d = runif(n*(n-1)/2, min=0, max=10^range_exp)
  # Fake a compressed distance matrix
  attributes(d) <- NULL
  attr(d,"Size") <- n
  attr(d, "call") <- 'N/A'
  class(d) <- "dist"
  return(d)
}

# Generate normally distributed random data
generate.normal <- function() {
  n = sample(10:1000,1)
  dim = sample(2:20,1)

  cat (sprintf("Number of sample points: %d\n",n))
  cat (sprintf("Dimension: %d\n",dim))

  pcd = matrix(rnorm(n*dim), c(n,dim))
  d = dist(pcd)
  return(d)
}

# Test the clustering functions when a distance matrix is given.
test.dm <-  function(d) {
  d2 = d
  if (hasWardD2) {
    methods = c('single','complete','average','mcquitty','ward.D','ward.D2','centroid','median')
  }
  else {
    methods = c('single','complete','average','mcquitty','ward','centroid','median')
  }
  for (method in methods) {
    cat(paste('Method :', method, '\n'))
    dg_stats       = stats::hclust(d, method=method)
    if (method == 'ward') {
      method = 'ward.D'
    }
    dg_fastcluster = fastcluster::hclust(d, method=method)
    if (!identical(d,d2)) {
      cat('Input array was corrupted!\n')
      stop(print_seed())
    }
    if (!compare(dg_stats, dg_fastcluster)) {
      stop(print_seed())
    }
  }
  cat('Passed.\n')
}

# Test the clustering functions for vector input in Euclidean space.
test.vector <-  function() {
  # generate test data
  n = sample(10:1000,1)
  dim = sample(2:20,1)
  cat (sprintf("Number of sample points: %d\n",n))
  cat (sprintf("Dimension: %d\n",dim))

  range_exp = runif(1,min=-10, max=10)
  pcd = matrix(rnorm(n*dim, sd=10^range_exp), c(n,dim))
  pcd2 = pcd
  # test
  method='single'
  cat(paste('Method:', method, '\n'))
  for (metric in c('euclidean', 'maximum', 'manhattan', 'canberra', 'minkowski')) {
    cat(paste('    Metric:', metric, '\n'))
    if (metric=='minkowski') {
      p = runif(1, min=1.0, max=10.0)
      cat (sprintf("    p: %g\n",p));
      dg_fastcluster = fastcluster::hclust.vector(pcd, method=method, metric=metric, p=p)
      d = dist(pcd, method=metric, p=p)
    }
    else {
      dg_fastcluster = fastcluster::hclust.vector(pcd, method=method, metric=metric)
      d = dist(pcd, method=metric)
    }
    d2 = d
    dg_fastcluster_dist = fastcluster::hclust(d, method=method)
    if (!identical(d,d2) || !identical(pcd,pcd2)) {
      cat('Input array was corrupted!\n')
      stop(print_seed())
    }
    if (!compare(dg_fastcluster_dist, dg_fastcluster)) {
      stop(print_seed())
    }
  }
  for (method in c('ward','centroid','median') ) {
    cat(paste('Method:', method, '\n'))
    dg_fastcluster = fastcluster::hclust.vector(pcd, method=method)
    if (!identical(pcd,pcd2)) {
      cat('Input array was corrupted!\n')
      stop(print_seed())
    }
    d = dist(pcd)
    if(method == "ward" && hasWardD2) {
      method = "ward.D2"
    }
    else
    {
      # Workaround: fastcluster::hclust expects _squared_ euclidean distances.
      d = d^2
    }
    d2 = d
    dg_fastcluster_dist = fastcluster::hclust(d, method=method)
    if (!identical(d,d2)) {
      cat('Input array was corrupted!\n')
      stop(print_seed())
    }
    if(method != "ward.D2") {
      dg_fastcluster_dist$height = sqrt(dg_fastcluster_dist$height)
    }
    # The Euclidean methods may have small numerical errors due to squaring/
    # taking the root in the Euclidean distances.
    if (!compare(dg_fastcluster_dist, dg_fastcluster)) {
      stop(print_seed())
    }
  }
  cat('Passed.\n')
}

# Test the single linkage function with the "binary" metric
test.vector.binary <- function() {
  # generate test data
  cat (sprintf("Uniform sampling for the 'binary' metric:\n"))
  n = sample(10:400,1)
  dim = sample(n:(2*n),1)
  cat (sprintf("Number of sample points: %d\n",n))
  cat (sprintf("Dimension: %d\n",dim))
  pcd = matrix(sample(-1:2, n*dim, replace=T), c(n,dim))
  pcd2 = pcd
  # test
  method='single'
  metric='binary'
  cat(paste('Method:', method, '\n'))
  cat(paste('    Metric:', metric, '\n'))
  dg_fastcluster = fastcluster::hclust.vector(pcd, method=method, metric=metric)
  d = dist(pcd, method=metric)
  d2 = d
  dg_fastcluster_dist       = fastcluster::hclust(d, method=method)
  if (!identical(d,d2) || !identical(d,d2)) {
    cat('Input array was corrupted!\n')
    stop(print_seed())
  }
  if (!compare(dg_fastcluster_dist, dg_fastcluster)) {
    stop(print_seed())
  }
  cat('Passed.\n')
}


N = 15
for (i in (1:N)) {
  if (i%%2==1) {
    cat(sprintf('Random test %d of %d (uniform distribution of distances):\n',i,2*N))
    d = generate.uniform()
  }
  else {
    cat(sprintf('Random test %d of %d (Gaussian density):\n',i,2*N))
    d = generate.normal()
  }
  test.dm(d)
}
for (i in (N+1:N)) {
  cat(sprintf('Random test %d of %d (Gaussian density):\n',i,2*N))
  test.vector()
  test.vector.binary()
}

cat('Done.\n')

Try the fastcluster package in your browser

Any scripts or data that you put into this service are public.

fastcluster documentation built on May 25, 2021, 1:06 a.m.