tests/testthat/test-bk_method.R

# library(testthat)
# library(dendextend)

context("Bk method (FM Index) between two trees")


test_that("sort_2_clusters_vectors works", {
  suppressWarnings(RNGversion("3.5.0"))
  set.seed(23235)
  ss <- sample(1:150, 4)
  hc1 <- hclust(dist(datasets::iris[ss, -5]), "com")
  hc2 <- hclust(dist(datasets::iris[ss, -5]), "single")
  # dend1 <- as.dendrogram(hc1)
  # dend2 <- as.dendrogram(hc2)
  #    cutree(dend1)

  A1_clusters <- cutree(hc1, k = 3)
  A2_clusters <- sample(cutree(hc1, k = 3))

  sorted_As <- sort_2_clusters_vectors(A1_clusters, A2_clusters, assume_sorted_vectors = TRUE) # no sorting
  expect_false(identical(sorted_As[[1]], sorted_As[[2]]))

  sorted_As <- sort_2_clusters_vectors(A1_clusters, A2_clusters, assume_sorted_vectors = FALSE) # Sorted
  expect_true(identical(sorted_As[[1]], sorted_As[[2]]))
})




test_that("FM_index_R works", {
  suppressWarnings(RNGversion("3.5.0"))

  #    set.seed(23235)
  ss <- TRUE # sample(1:150, 10 )
  hc1 <- hclust(dist(datasets::iris[ss, -5]), "com")
  hc2 <- hclust(dist(datasets::iris[ss, -5]), "single")
  # dend1 <- as.dendrogram(hc1)
  # dend2 <- as.dendrogram(hc2)
  #    cutree(dend1)

  # FM index of a cluster with himself is 1:
  expect_equivalent(FM_index_R(cutree(hc1, k = 3), cutree(hc1, k = 3)), 1)
  # removing the attr - it is EXACTLY 1: (but NOT 1L)
  expect_identical(as.vector(FM_index_R(cutree(hc1, k = 3), cutree(hc1, k = 3))), 1)

  # sorting of the clusters based on their names:
  set.seed(1341)
  expect_false(identical(
    as.vector(
      FM_index_R(cutree(hc1, k = 3),
        sample(cutree(hc1, k = 3)),
        assume_sorted_vectors = TRUE
      )
    ), 1
  ))
  # It actually becomes: 0.38037
  # but if we leave sorting as TRUE, we will get 1:

  expect_true(identical(
    as.vector(FM_index_R(cutree(hc1, k = 3),
      sample(cutree(hc1, k = 3)),
      assume_sorted_vectors = FALSE
    )), 1
  ))
  # it is actually the default:
  expect_true(identical(
    as.vector(FM_index_R(
      cutree(hc1, k = 3),
      sample(cutree(hc1, k = 3))
    )), 1
  ))


  # we can get a range of FM inexes using the following:
  fo <- function(k) FM_index_R(cutree(hc1, k), cutree(hc2, k))
  #    dput(round(unname(sapply(1:4, fo)),2))

  expect_identical(
    round(unname(sapply(1:4, fo)), 2),
    c(1, 0.71, 0.81, 0.75)
  )
  #    ks <- 1:150
  #    plot(sapply(ks, fo)~ ks, type = "b", main = "Bk plot for the datasets::iris dataset")
  #

  set.seed(234234)
  tmp_index <- FM_index_R(cutree(hc1, k = 3), sample(cutree(hc1, k = 3)), assume_sorted_vectors = TRUE)
  expect_identical(round(as.vector(tmp_index), 2), 0.38)
  expect_identical(round(attr(tmp_index, "E_FM"), 2), 0.37)
  expect_identical(round(sqrt(attr(tmp_index, "V_FM")), 3), 0.008)
})



test_that("FM_index works", {
  suppressWarnings(RNGversion("3.5.0"))
  #    set.seed(23235)
  ss <- TRUE # sample(1:150, 10 )
  hc1 <- hclust(dist(datasets::iris[ss, -5]), "com")
  hc2 <- hclust(dist(datasets::iris[ss, -5]), "single")
  # dend1 <- as.dendrogram(hc1)
  # dend2 <- as.dendrogram(hc2)
  #    cutree(dend1)

  # FM index of a cluster with himself is 1:
  expect_equivalent(FM_index(cutree(hc1, k = 3), cutree(hc1, k = 3)), 1)
  # removing the attr - it is EXACTLY 1: (but NOT 1L)
  expect_identical(as.vector(FM_index(cutree(hc1, k = 3), cutree(hc1, k = 3))), 1)

  # sorting of the clusters based on their names:
  set.seed(1341)
  expect_false(identical(
    as.vector(
      FM_index(cutree(hc1, k = 3),
        sample(cutree(hc1, k = 3)),
        assume_sorted_vectors = TRUE
      )
    ), 1
  ))
  # It actually becomes: 0.38037
  # but if we leave sorting as TRUE, we will get 1:

  expect_true(identical(
    as.vector(FM_index(cutree(hc1, k = 3),
      sample(cutree(hc1, k = 3)),
      assume_sorted_vectors = FALSE
    )), 1
  ))
  # it is actually the default:
  expect_true(identical(
    as.vector(FM_index(
      cutree(hc1, k = 3),
      sample(cutree(hc1, k = 3))
    )), 1
  ))


  # we can get a range of FM inexes using the following:
  fo <- function(k) FM_index(cutree(hc1, k), cutree(hc2, k))
  #    dput(round(unname(sapply(1:4, fo)),2))

  expect_identical(
    round(unname(sapply(1:4, fo)), 2),
    c(1, 0.71, 0.81, 0.75)
  )
  #    ks <- 1:150
  #    plot(sapply(ks, fo)~ ks, type = "b", main = "Bk plot for the datasets::iris dataset")
  
})







test_that("FM_index_permutation works", {
  set.seed(23235)
  ss <- TRUE # sample(1:150, 10 )
  hc1 <- hclust(dist(datasets::iris[ss, -5]), "com")
  hc2 <- hclust(dist(datasets::iris[ss, -5]), "single")
  # dend1 <- as.dendrogram(hc1)
  # dend2 <- as.dendrogram(hc2)
  #    cutree(dend1)

  A1_clusters <- cutree(hc1, k = 3)
  A2_clusters <- A1_clusters

  R <- 10
  set.seed(414130)
  FM_index_H0 <- replicate(R, FM_index_permutation(A1_clusters, A2_clusters)) # can take 10 sec

  expect_identical(
    round(mean(FM_index_H0), 2),
    0.37
  )
  expect_identical(
    round(sd(FM_index_H0), 2),
    0.01
  )
})








test_that("Bk works", {
  set.seed(23235)
  ss <- TRUE # sample(1:150, 10 )
  hc1 <- hclust(dist(datasets::iris[ss, -5]), "com")
  hc2 <- hclust(dist(datasets::iris[ss, -5]), "single")
  dend1 <- as.dendrogram(hc1)
  dend2 <- as.dendrogram(hc2)
  #    cutree(tree1)

  expect_identical(
    Bk(hc1, hc2, k = 3),
    Bk(dend1, dend2, k = 3, )
  )

  expect_identical(
    Bk(hc1, hc2, k = 3),
    Bk(dend1, dend2, k = 3)
  )

  expect_identical(
    round(as.numeric(Bk(hc1, hc2, k = 3)), 3),
    0.806
  )
  expect_identical(
    round(as.numeric(Bk(dend1, dend2, k = 3)), 3),
    0.806
  )


  set.seed(23234535)
  expect_identical(
    round(
      Bk_permutations(dend1,
        dend2,
        k = 3, R = 2
      )[[1]], 2
    ),
    c(.45, .44)
  )



  #    Bk_plot(hc1, hc2, main = "WRONG Bk plot \n(due to the way cutree works with ties)")
  #    Bk_plot(dend1, dend2, main = "CORRECT Bk plot")
})
talgalili/dendextend documentation built on Jan. 27, 2024, 7:43 p.m.