tests/testthat/test_rptree.R

library(rnndescent)
context("RP Tree")

# just one tree
expected_rpt_knn <- list(
  idx = matrix(
    c(
      1, 6, 10, 0,
      2, 7, 4, 9,
      3, 5, 8, 0,
      4, 9, 2, 7,
      5, 8, 3, 0,
      6, 1, 10, 0,
      7, 2, 4, 9,
      8, 5, 3, 0,
      9, 4, 2, 7,
      10, 6, 1, 0
    ),
    nrow = 10,
    byrow = TRUE
  ),
  dist = matrix(
    c(
      0, 0.3464102, 0.6782330, NA,
      0, 0.3000002, 0.5477225, 0.6164413,
      0, 0.3316626, 0.4690416, NA,
      0, 0.3464101, 0.5477225, 0.6708205,
      0, 0.1732050, 0.3316626, NA,
      0, 0.3464102, 0.5830952, NA,
      0, 0.3000002, 0.6708205, 0.8185353,
      0, 0.1732050, 0.4690416, NA,
      0, 0.3464101, 0.6164413, 0.8185353,
      0, 0.5830952, 0.6782330, NA
    ),
    nrow = 10,
    byrow = TRUE
  )
)

set.seed(1337)
res <- rpf_knn(ui10, k = 4, leaf_size = 4, n_trees = 1)
expect_equal(res, expected_rpt_knn, tol = 1e-7)

set.seed(1337)
res <- rpf_knn(ui10, k = 4)
expect_equal(res$idx, ui10_nn4$idx, check.attributes = FALSE)
expect_equal(res$dist, ui10_nn4$dist, check.attributes = FALSE, tol = 1e-4)

set.seed(1337)
res <- rpf_knn(ui10, k = 4, include_self = FALSE)
expect_equal(res$idx[, 1:3], ui10_nn4$idx[, 2:4], check.attributes = FALSE)
expect_equal(res$dist[, 1:3], ui10_nn4$dist[, 2:4], check.attributes = FALSE, tol = 1e-4)

# euclidean
set.seed(1337)
res <- rpf_knn(ui10, k = 4)
expect_equal(res$idx, ui10_nn4$idx, check.attributes = FALSE)
expect_equal(res$dist, ui10_nn4$dist, check.attributes = FALSE, tol = 1e-4)

# cosine
set.seed(1337)
uiris_rnn <- rpf_knn(uirism, 15, metric = "cosine", n_trees = 40)
expect_equal(sum(uiris_rnn$dist), 1.347357, tol = 1e-3)


# multi-threading
set.seed(1337)
res <- rpf_knn(ui10, k = 4, leaf_size = 3, n_threads = 2, n_trees = 1)
expect_in(c(NA), res$dist)
expect_in(c(0), res$idx)

set.seed(1337)
res <- rpf_knn(ui10, k = 4, n_threads = 2)
expect_equal(res$idx, ui10_nn4$idx, check.attributes = FALSE)
expect_equal(res$dist, ui10_nn4$dist, check.attributes = FALSE, tol = 1e-4)

# euclidean converges
set.seed(1337)
uiris_rnn <- rpf_knn(uiris, 15, n_trees = 40, n_threads = 2)
expect_equal(sum(uiris_rnn$dist), ui_edsum, tol = 1e-3)

# cosine
set.seed(1337)
uiris_rnn <- rpf_knn(uirism, 15, metric = "cosine", n_trees = 40, n_threads = 2)
expect_equal(sum(uiris_rnn$dist), 1.347357, tol = 1e-3)

# R index
expected_rpf_index <- list(
  trees = list(
    list(
      hyperplanes = matrix(c(
        -0.5000000, -0.8000002, -0.2, -0.3,
        0.3000002, -0.3000002, 0.1, -0.2,
        0.0000000, 0.0000000, 0.0, 0.0,
        0.0000000, 0.0000000, 0.0, 0.0,
        0.0000000, 0.0000000, 0.0, 0.0
      ), nrow = 5, byrow = TRUE),
      offsets = c(5.7700009, -0.5550003, NA, NA, NA),
      children = matrix(c(
        1, 4,
        2, 3,
        0, 3,
        3, 7,
        7, 10
      ), nrow = 5, byrow = TRUE),
      indices = c(2, 4, 7, 1, 3, 6, 8, 0, 5, 9),
      leaf_size = 4
    )
  ),
  margin = "explicit",
  actual_metric = "sqeuclidean",
  version = "0.0.12",
  use_alt_metric = TRUE,
  original_metric = "euclidean",
  sparse = FALSE,
  type = "rnndescent:rpforest"
)

set.seed(1337)
rpf_index <- rpf_build(ui10, metric = "euclidean", n_trees = 1, leaf_size = 4)
expect_equal(rpf_index, expected_rpf_index, tol = 1e-7)

# query data against itself to reproduce knn (just more slowly)
set.seed(1337)
rpf_query_res <-
  rpf_knn_query(
    ui10,
    ui10,
    rpf_index,
    k = 4,
    n_threads = 0,
    cache = TRUE
  )
expect_equal(rpf_query_res, expected_rpt_knn, tol = 1e-7)

set.seed(1337)
rpf_query_res <-
  rpf_knn_query(
    ui10,
    ui10,
    rpf_index,
    k = 4,
    n_threads = 0,
    cache = FALSE
  )
expect_equal(rpf_query_res, expected_rpt_knn, tol = 1e-7)

# return forest with knn
set.seed(1337)
rpf_knnf <-
  rpf_knn(
    ui10,
    k = 4,
    metric = "euclidean",
    n_trees = 1,
    ret_forest = TRUE,
    leaf_size = 4
  )
expect_equal(rpf_knnf$forest, expected_rpf_index, tol = 1e-7)
expect_equal(
  list(idx = rpf_knnf$idx, dist = rpf_knnf$dist),
  expected_rpt_knn,
  check.attributes = FALSE,
  tol = 1e-7
)

set.seed(1337)
nnd_with_tree <-
  nnd_knn(
    ui10,
    k = 4,
    ret_forest = TRUE,
    init = "tree",
    init_args = list(n_trees = 1, leaf_size = 4)
  )
expect_equal(nnd_with_tree$forest, expected_rpf_index, tol = 1e-7)

# handle alt metric
set.seed(1337)
nnd_with_tree <-
  nnd_knn(
    ui10,
    k = 4,
    ret_forest = TRUE,
    init = "tree",
    use_alt_metric = FALSE,
    init_args = list(n_trees = 1, leaf_size = 4)
  )
rpf_index_no_alt <- expected_rpf_index
rpf_index_no_alt$use_alt_metric <- FALSE
rpf_index_no_alt$actual_metric <- "euclidean"
expect_equal(nnd_with_tree$forest, rpf_index_no_alt, tol = 1e-7)


# filtering
set.seed(1337)
rpf_knnf3 <-
  rpf_knn(
    ui10,
    k = 4,
    metric = "euclidean",
    n_trees = 3,
    ret_forest = TRUE,
    leaf_size = 4
  )
expect_equal(length(rpf_knnf3$forest$trees), 3)
rpf_f3f <- rpf_filter(rpf_knnf3, n_trees = 1)
expect_equal(length(rpf_f3f$trees), 1)
expect_equal(rpf_f3f$trees[[1]], rpf_knnf3$forest$trees[[1]])


test_that("can't pass mismatched forest and nn to filter", {
  iris_knn_with_forest <-
    rpf_knn(iris[1:50, ], k = 15, ret_forest = TRUE)
  iris_query_virginica <-
    rpf_knn_query(
      query = iris[51:150, ],
      reference = iris[1:50, ],
      forest = iris_knn_with_forest$forest,
      k = 15
    )

  iris_forest <- rpf_build(iris, leaf_size = 15)
  expect_error(
    rpf_filter(
      nn = iris_query_virginica,
      forest = iris_forest,
      n_trees = 1
    ), "Mismatched"
  )
})

set.seed(1337)
expect_equal(
  rpf_build(ui10, metric = "euclidean", leaf_size = 4, n_threads = 0),
  rpf_index_ls4e,
  tol = 1e-7
)

# implicit margin
set.seed(1337)
rpi_knn <- rpf_knn(uirism[1:20, ], k = 4, verbose = FALSE, n_threads = 0, n_trees = 2, margin = "implicit")
set.seed(1337)
rpe_knn <- rpf_knn(uirism[1:20, ], k = 4, verbose = FALSE, n_threads = 0, n_trees = 2, margin = "explicit")
expect_equal(rpe_knn$dist, rpi_knn$dist)

expected_rpfi_index <- list(
  trees = list(list(
    normal_indices = matrix(c(
      4, 0,
      4, 1,
      -1, -1,
      -1, -1,
      -1, -1
    ), nrow = 5, byrow = TRUE),
    children = matrix(c(
      1, 4,
      2, 3,
      0, 3,
      3, 7,
      7, 10
    ), nrow = 5, byrow = TRUE),
    indices = c(2, 4, 7, 1, 3, 6, 8, 0, 5, 9),
    leaf_size = 4
  )),
  margin = "implicit",
  actual_metric = "sqeuclidean",
  version = "0.0.12",
  use_alt_metric = TRUE,
  original_metric = "euclidean",
  sparse = FALSE,
  type = "rnndescent:rpforest"
)
set.seed(1337)
rpf_knn2df <- rpf_knn(
  ui10,
  k = 4,
  metric = "euclidean",
  n_trees = 1,
  ret_forest = TRUE,
  leaf_size = 4,
  margin = "implicit"
)
expect_equal(list(idx = rpf_knn2df$idx, dist = rpf_knn2df$dist), expected_rpt_knn, tol = 1e-7)
expect_equal(rpf_knn2df$forest, expected_rpfi_index)

set.seed(1337)
rpfi_query_res <-
  rpf_knn_query(
    ui10,
    ui10,
    rpf_knn2df$forest,
    k = 4,
    n_threads = 0,
    cache = TRUE
  )
expect_equal(rpfi_query_res, expected_rpt_knn, tol = 1e-7)


set.seed(1337)
rpf_knnfi3 <-
  rpf_knn(
    ui10,
    k = 4,
    metric = "euclidean",
    n_trees = 3,
    ret_forest = TRUE,
    leaf_size = 4,
    margin = "implicit"
  )
expect_equal(length(rpf_knnfi3$forest$trees), 3)
rpf_fi3f <- rpf_filter(rpf_knnfi3, n_trees = 1)
expect_equal(length(rpf_fi3f$trees), 1)
expect_equal(rpf_fi3f$trees[[1]], rpf_knnfi3$forest$trees[[1]])
expect_equal(rpf_fi3f$margin, rpf_knnfi3$forest$margin)
expect_equal(rpf_fi3f$actual_metric, rpf_knnfi3$forest$actual_metric)
expect_equal(rpf_fi3f$version, rpf_knnfi3$forest$version)
expect_equal(rpf_fi3f$use_alt_metric, rpf_knnfi3$forest$use_alt_metric)
expect_equal(rpf_fi3f$original_metric, rpf_knnfi3$forest$original_metric)


set.seed(1337)
rpf_knnff3 <-
  rpf_knn(
    ui10,
    k = 4,
    metric = "euclidean",
    n_trees = 3,
    ret_forest = TRUE,
    leaf_size = 4,
    margin = "explicit"
  )
expect_equal(length(rpf_knnff3$forest$trees), 3)
rpf_ff3f <- rpf_filter(rpf_knnff3, n_trees = 1)
expect_equal(length(rpf_ff3f$trees), 1)
expect_equal(rpf_ff3f$trees[[1]], rpf_knnff3$forest$trees[[1]])
expect_equal(rpf_ff3f$margin, rpf_knnff3$forest$margin)
expect_equal(rpf_ff3f$actual_metric, rpf_knnff3$forest$actual_metric)
expect_equal(rpf_ff3f$version, rpf_knnff3$forest$version)
expect_equal(rpf_ff3f$use_alt_metric, rpf_knnff3$forest$use_alt_metric)
expect_equal(rpf_ff3f$original_metric, rpf_knnff3$forest$original_metric)

set.seed(1337)
expect_equal(
  rpf_build(ui10, metric = "euclidean", leaf_size = 4, margin = "implicit", n_threads = 0),
  rpf_index_ls4i,
  tol = 1e-7
)

set.seed(1337)
rpf_index_ls4i_no_alt <- rpf_index_ls4i
rpf_index_ls4i_no_alt$use_alt_metric <- FALSE
rpf_index_ls4i_no_alt$actual_metric <- "euclidean"

expect_equal(
  rpf_build(ui10, metric = "euclidean", use_alt_metric = FALSE, leaf_size = 4, margin = "implicit", n_threads = 0),
  rpf_index_ls4i_no_alt,
  tol = 1e-7
)

# cosine test
set.seed(1337)
uiriscos <-
  rpf_knn(
    uirism,
    k = 15,
    metric = "cosine",
    n_threads = 0,
    ret_forest = TRUE,
    n_trees = 1
  )
set.seed(1337)
uiriscosq <-
  rpf_knn_query(
    uirism,
    uirism,
    uiriscos$forest,
    k = 15,
    n_threads = 0,
  )
# handle ties where indices swap places
expect_equal(sum(uiriscos$idx - uiriscosq$idx), 0)

# test uncached
uiriscosq_nocache <- rpf_knn_query(
  uirism,
  uirism,
  uiriscos$forest,
  k = 15,
  n_threads = 2,
  cache = FALSE
)
expect_equal(sum(uiriscos$idx - uiriscosq_nocache$idx), 0)

set.seed(1337)
uiriscosi <-
  rpf_knn(
    uirism,
    k = 15,
    metric = "cosine",
    n_threads = 0,
    ret_forest = TRUE,
    margin = "implicit",
    n_trees = 1
  )
set.seed(1337)
uiriscosiq <-
  rpf_knn_query(
    uirism,
    uirism,
    uiriscosi$forest,
    k = 15,
    n_threads = 0,
  )
expect_equal(sum(uiriscosi$idx - uiriscosq$idx), 0)
expect_equal(sum(uiriscosi$idx - uiriscosiq$idx), 0)

set.seed(1337)
ui6f <- rpf_knn(
  ui6,
  k = 4,
  leaf_size = 3,
  ret_forest = TRUE
)
qnbrs4 <- graph_knn_query(reference = ui6, reference_graph = ui6f, query = ui4, init = ui6f$forest, k = 4)
expect_equal(sum(qnbrs4$dist), ui4q_edsum, tol = 1e-6)

test_that("binary data", {
  # euclidean forces conversion to float data
  set.seed(1337)
  bin_euc_imp <- rpf_knn(lbitdata, k = 4, margin = "implicit")
  set.seed(1337)
  bin_euc_exp <- rpf_knn(lbitdata, k = 4, margin = "explicit")
  expect_equal(bin_euc_imp, bin_euc_exp)

  set.seed(1337)
  bin_jac_imp <- rpf_knn(lbitdata, k = 4, margin = "implicit", metric = "jaccard")
  set.seed(1337)
  bin_jac_exp <- rpf_knn(lbitdata, k = 4, margin = "explicit", metric = "jaccard")
  expect_equal(bin_jac_imp, bin_jac_exp)
  set.seed(1337)
  bin_jac_aut <- rpf_knn(lbitdata, k = 4, margin = "auto", metric = "jaccard")
  expect_equal(bin_jac_aut, bin_jac_imp)

  set.seed(1337)
  euc_forest_i <- rpf_build(lbitdata, leaf_size = 10, margin = "implicit")
  bin_euc_impq <-
    rpf_knn_query(
      query = lbitdata,
      reference = lbitdata,
      forest = euc_forest_i,
      k = 4
    )
  expect_equal(bin_euc_impq, bin_euc_imp)

  set.seed(1337)
  euc_forest_e <- rpf_build(lbitdata, leaf_size = 10, margin = "explicit")
  bin_euc_expq <-
    rpf_knn_query(
      query = lbitdata,
      reference = lbitdata,
      forest = euc_forest_e,
      k = 4
    )
  expect_equal(bin_euc_expq, bin_euc_exp)

  set.seed(1337)
  euc_forest_a <- rpf_build(lbitdata, leaf_size = 10, margin = "auto")
  bin_euc_autq <-
    rpf_knn_query(
      query = lbitdata,
      reference = lbitdata,
      forest = euc_forest_a,
      k = 4
    )
  expect_equal(bin_euc_autq, bin_euc_exp)
})

test_that("sparse implicit margin", {
  set.seed(1337)
  dknn <- rpf_knn(ui10z, k = 4, leaf_size = 3, n_trees = 2, margin = "implicit")
  set.seed(1337)
  sknn <- rpf_knn(ui10sp, k = 4, leaf_size = 3, n_trees = 2, margin = "implicit")
  expect_equal(sknn, dknn)

  set.seed(1337)
  dknn <- rpf_knn(ui10z, k = 4, leaf_size = 3, n_trees = 2, margin = "implicit", ret_forest = TRUE)
  set.seed(1337)
  sknn <- rpf_knn(ui10sp, k = 4, leaf_size = 3, n_trees = 2, margin = "implicit", ret_forest = TRUE)
  expect_equal(list(idx = sknn$idx, dist = sknn$dist), list(idx = dknn$idx, dist = dknn$dist))
  dknn$forest$sparse <- TRUE
  expect_equal(sknn$forest, dknn$forest)

  set.seed(1337)
  dforest <- rpf_build(ui10z, leaf_size = 3, n_trees = 2, margin = "implicit", metric = "cosine")
  set.seed(1337)
  sforest <- rpf_build(ui10sp, leaf_size = 3, n_trees = 2, margin = "implicit", metric = "cosine")
  expect_equal(sforest$actual_metric, "alternative-cosine")
  expect_true(sforest$sparse)
  sforest$sparse <- FALSE
  expect_equal(sforest, dforest)

  set.seed(1337)
  dforest6 <- rpf_build(ui10z6, leaf_size = 3, n_trees = 2, margin = "implicit", metric = "cosine")
  set.seed(1337)
  dquery4 <- rpf_knn_query(query = ui10z4, reference = ui10z6, forest = dforest6, k = 4)
  expect_error(squery4 <- rpf_knn_query(query = ui10sp4, reference = ui10sp6, forest = dforest6, k = 4), "sparse forest")
  # hack the forest to force it to work with sparse
  dforest6$sparse <- TRUE
  set.seed(1337)
  squery4 <- rpf_knn_query(query = ui10sp4, reference = ui10sp6, forest = dforest6, k = 4)
  expect_equal(squery4, dquery4, tol = 1e-4)

  set.seed(1337)
  sforest6 <- rpf_build(ui10sp6, leaf_size = 3, n_trees = 2, margin = "implicit", metric = "cosine")
  set.seed(1337)
  squery4b <- rpf_knn_query(query = ui10sp4, reference = ui10sp6, forest = sforest6, k = 4)
  expect_equal(squery4b, squery4, tol = 1e-5)

  set.seed(1337)
  squery4b <- rpf_knn_query(query = ui10sp4, reference = ui10sp6, forest = sforest6, k = 4, cache = FALSE)
  expect_equal(squery4b, squery4, tol = 1e-5)
})


test_that("sparse explicit margin", {
  set.seed(1337)
  dknn <- rpf_knn(ui10z, k = 4, leaf_size = 3, n_trees = 2, margin = "explicit")
  set.seed(1337)
  sknn <- rpf_knn(ui10sp, k = 4, leaf_size = 3, n_trees = 2, margin = "explicit")
  expect_equal(sknn, dknn)

  # implict and explicit should give the same results for euclidean
  set.seed(1337)
  siknn <- rpf_knn(ui10sp, k = 4, leaf_size = 3, n_trees = 2, margin = "implicit")
  expect_equal(siknn, sknn)

  set.seed(1337)
  sknn6 <- rpf_knn(ui10sp6, k = 4, leaf_size = 2, n_trees = 2, ret_forest = TRUE)
  set.seed(1337)
  sforest6 <- rpf_build(ui10sp6, leaf_size = 2, n_trees = 2)
  expect_equal(sforest6$margin, "explicit")
  expect_equal(sknn6$forest$margin, "explicit")
  expect_equal(sforest6, sknn6$forest)

  s6_ff <- rpf_filter(sknn6)
  expect_equal(length(s6_ff$trees), 1)
  expect_equal(s6_ff$trees[[1]], sknn6$forest$trees[[1]])

  set.seed(1337)
  res_forest <- rpf_knn_query(ui10sp4, ui10sp6, forest = sforest6, k = 4)
  set.seed(1337)
  res_knnforest <- rpf_knn_query(ui10sp4, ui10sp6, forest = sknn6$forest, k = 4)
  expect_equal(res_forest, res_knnforest)

  set.seed(1337)
  dknn6 <- rpf_knn(ui10z6, k = 4, leaf_size = 2, n_trees = 2, margin = "explicit", ret_forest = TRUE)
  set.seed(1337)
  res_dknn <- rpf_knn_query(ui10z4, ui10z6, forest = dknn6$forest, k = 4)
  expect_equal(res_forest, res_dknn)

  # uncached
  set.seed(1337)
  res_dknn_nocache <- rpf_knn_query(ui10z4, ui10z6, forest = dknn6$forest, k = 4, cache = FALSE)
  expect_equal(res_forest, res_dknn_nocache)

  # implict and explicit should give the same results for cosine also
  set.seed(1337)
  secknn <- rpf_knn(ui10sp, k = 4, leaf_size = 3, n_trees = 2, margin = "explicit", metric = "cosine")
  set.seed(1337)
  sicknn <- rpf_knn(ui10sp, k = 4, leaf_size = 3, n_trees = 2, margin = "implicit", metric = "cosine")
  expect_equal(secknn, sicknn)

  set.seed(1337)
  sacknn <- rpf_knn(ui10sp,
    k = 4, leaf_size = 3, n_trees = 2,
    margin = "explicit", metric = "cosine", use_alt_metric = FALSE
  )
  expect_equal(sacknn, secknn, tol = 1e-4)
})
jlmelville/rnndescent documentation built on April 19, 2024, 8:26 p.m.