tests/testthat/test-hdbscan.R

test_that("HDBSCAN", {
  data("iris")

  ## minPts not given
  expect_error(hdbscan(iris))

  ## Expects numerical data; species is factor
  expect_error(dbscan(iris, minPts = 4))

  iris <- as.matrix(iris[,1:4])

  res <- hdbscan(iris, minPts = 4)
  expect_length(res$cluster, nrow(iris))

  ## expected result of table(res$cluster) is:
  expect_identical(table(res$cluster, dnn = NULL),
                    as.table(c("1" = 100L, "2" = 50L)))

  ## compare on moons data
  data("moons")
  res <- hdbscan(moons, minPts = 5)
  expect_length(res$cluster, nrow(moons))

  ## Check hierarchy matches dbscan* at every value
  check <- rep(FALSE, nrow(moons)-1)
  core_dist <- kNNdist(moons, k=5-1)

  ## cutree doesn't distinguish noise as 0, so we make a new method to do it manually
  cut_tree <- function(hcl, eps, core_dist){
    cuts <- unname(cutree(hcl, h=eps))
    cuts[which(core_dist > eps)] <- 0 # Use core distance to distinguish noise
    cuts
  }

  eps_values <- sort(res$hc$height, decreasing = TRUE)+.Machine$double.eps ## Machine eps for consistency between cuts
  for (i in seq_along(eps_values)) {
    cut_cl <- cut_tree(res$hc, eps_values[i], core_dist)
    dbscan_cl <- dbscan(moons, eps = eps_values[i], minPts = 5, borderPoints = FALSE) # DBSCAN* doesn't include border points

    ## Use run length encoding as an ID-independent way to check ordering
    check[i] <- (all.equal(rle(cut_cl)$lengths, rle(dbscan_cl$cluster)$lengths) == "TRUE")
  }

  expect_true(all(check))

  ## Expect generating extra trees doesn't fail
  res <- hdbscan(moons, minPts = 5, gen_hdbscan_tree = TRUE, gen_simplified_tree = TRUE)
  expect_s3_class(res, "hdbscan")

  ## Expect hdbscan tree matches stats:::as.dendrogram version of hclust object
  hc_dend <- as.dendrogram(res$hc)
  expect_s3_class(hc_dend, "dendrogram")
  expect_identical(hc_dend, res$hdbscan_tree)

  ## Expect hdbscan works with non-euclidean distances
  dist_moons <- dist(moons, method = "canberra")
  res <- hdbscan(dist_moons, minPts = 5)
  expect_s3_class(res, "hdbscan")
})

test_that("mrdist", {
  expect_identical(mrdist(cbind(1:10), 2),  mrdist(dist(cbind(1:10)), 2))
  expect_identical(mrdist(cbind(1:11), 3), mrdist(dist(cbind(1:11)), 3))
})

test_that("HDBSCAN(e)", {
  X <- data.frame(
   x = c(
    0.08, 0.46, 0.46, 2.95, 3.50, 1.49, 6.89, 6.87, 0.21, 0.15,
    0.15, 0.39, 0.80, 0.80, 0.37, 3.63, 0.35, 0.30, 0.64, 0.59, 1.20, 1.22,
    1.42, 0.95, 2.70, 6.36, 6.36, 6.36, 6.60, 0.04, 0.71, 0.57, 0.24, 0.24,
    0.04, 0.04, 1.35, 0.82, 1.04, 0.62, 0.26, 5.98, 1.67, 1.67, 0.48, 0.15,
    6.67, 6.67, 1.20, 0.21, 3.99, 0.12, 0.19, 0.15, 6.96, 0.26, 0.08, 0.30,
    1.04, 1.04, 1.04, 0.62, 0.04, 0.04, 0.04, 0.82, 0.82, 1.29, 1.35, 0.46,
    0.46, 0.04, 0.04, 5.98, 5.98, 6.87, 0.37, 6.47, 6.47, 6.47, 6.67, 0.30,
    1.49, 3.21, 3.21, 0.75, 0.75, 0.46, 0.46, 0.46, 0.46, 3.63, 0.39, 3.65,
    4.09, 4.01, 3.36, 1.43, 3.28, 5.94, 6.35, 6.87, 5.60, 5.99, 0.12, 0.00,
    0.32, 0.39, 0.00, 1.63, 1.36, 5.67, 5.60, 5.79, 1.10, 2.99, 0.39, 0.18
    ),
   y = c(
    7.41, 8.01, 8.01, 5.44, 7.11, 7.13, 1.83, 1.83, 8.22, 8.08,
    8.08, 7.20, 7.83, 7.83, 8.29, 5.99, 8.32, 8.22, 7.38, 7.69, 8.22, 7.31,
    8.25, 8.39, 6.34, 0.16, 0.16, 0.16, 1.66, 7.55, 7.90, 8.18, 8.32, 8.32,
    7.97, 7.97, 8.15, 8.43, 7.83, 8.32, 8.29, 1.03, 7.27, 7.27, 8.08, 7.27,
    0.79, 0.79, 8.22, 7.73, 6.62, 7.62, 8.39, 8.36, 1.73, 8.29, 8.04, 8.22,
    7.83, 7.83, 7.83, 8.32, 8.11, 7.69, 7.55, 7.20, 7.20, 8.01, 8.15, 7.55,
    7.55, 7.97, 7.97, 1.03, 1.03, 1.24, 7.20, 0.47, 0.47, 0.47, 0.79, 8.22,
    7.13, 6.48, 6.48, 7.10, 7.10, 8.01, 8.01, 8.01, 8.01, 5.99, 8.04, 5.22,
    5.82, 5.14, 4.81, 7.62, 5.73, 0.55, 1.31, 0.05, 0.95, 1.59, 7.99, 7.48,
    8.38, 7.12, 2.01, 1.40, 0.00, 9.69, 9.47, 9.25, 2.63, 6.89, 0.56, 3.11
   )
  )

  hdbe <- hdbscan(X, minPts = 3, cluster_selection_epsilon = 1)
  #plot(X, col = hdbe$cluster + 1L, main = "HDBSCAN(e)")

  expect_equal(ncluster(hdbe), 5L)
  expect_equal(nnoise(hdbe), 0L)
})
mhahsler/dbscan documentation built on Aug. 30, 2024, 11:20 p.m.