# tests/testthat/helper_data.R

# Ten rows taken from the de-duplicated iris measurements, chosen so that the
# 4 nearest neighbors are distinct.
uiris <- iris[!duplicated(iris), , drop = FALSE]
# Numeric measurements only: the fifth column (Species) is left out.
uirism <- as.matrix(uiris[, 1:4])
ui10 <- uirism[seq(6, 15), ]

# Sparse-format copy of the complete ui10 data.
ui10sp_full <- Matrix::drop0(x = ui10)

# Dense copy of ui10 in which 10 randomly sampled cells (fixed seed) are
# overwritten with zero, followed by its sparse counterpart.
set.seed(1337)
ui10z <- local({
  zeroed <- ui10
  zeroed[sample(prod(dim(zeroed)), 10)] <- 0
  zeroed
})
ui10sp <- Matrix::drop0(x = ui10z)

# First 6 and last 4 rows of the zeroed data, dense ...
ui10z6 <- head(ui10z, n = 6)
ui10z4 <- tail(ui10z, n = 4)

# ... and sparse.
ui10sp6 <- head(ui10sp, n = 6)
ui10sp4 <- tail(ui10sp, n = 4)

# Sums of neighbor distances are treated as an objective function.
# Expected value comes from sum(FNN::get.knn(uirism, 14)$nn.dist)
ui_edsum <- 1016.834
# Expected value comes from sum(FNN::get.knn(ui10, 3)$nn.dist)
ui10_edsum <- 13.28425

# First 6 and last 4 rows of ui10, plus the full Euclidean distance matrix.
ui6 <- ui10[seq_len(6), ]
ui4 <- ui10[seq(7, 10), ]
ui10_eucd <- as.matrix(dist(x = ui10, method = "euclidean"))

# Euclidean query sums.
# sum(RcppHNSW::hnsw_search(ui4, RcppHNSW::hnsw_build(ui6), k = 4)$dist)
ui4q_edsum <- 9.310494
# sum(RcppHNSW::hnsw_search(ui6, RcppHNSW::hnsw_build(ui4), k = 4)$dist)
ui6q_edsum <- 18.98666

# Cosine query sums.
# NB Annoy and HNSW don't agree to more than this number of decimal places
# sum(RcppHNSW::hnsw_search(ui4, RcppHNSW::hnsw_build(ui6, distance = "cosine"), k = 4)$dist)
ui4q_cdsum <- 0.02072
# sum(RcppHNSW::hnsw_search(ui6, RcppHNSW::hnsw_build(ui4, distance = "cosine"), k = 4)$dist)
ui6q_cdsum <- 0.04220

# Manhattan query sums: values taken from RcppAnnoy
ui4q_mdsum <- 15.4
ui6q_mdsum <- 31.6

# Hamming
# Build an nrow x ncol integer matrix of random bits.
#
# Every entry is an independent draw from rbinom() with size = 1, so it is 1
# with probability `prob` and 0 otherwise. The draws use R's global RNG; call
# set.seed() beforehand for reproducible output.
#
# nrow, ncol: dimensions of the matrix to create.
# prob: probability that an entry is 1 (default 0.5).
# Returns an integer matrix of 0s and 1s, filled column by column.
bitm <- function(nrow, ncol, prob = 0.5) {
  bits <- rbinom(n = nrow * ncol, size = 1, prob = prob)
  # Give matrix() both dimensions. With only `ncol` supplied the row count is
  # inferred from the data length, so a zero-column request would come back as
  # a 0 x 0 matrix rather than nrow x 0.
  matrix(bits, nrow = nrow, ncol = ncol)
}

# Seeded example data: a 10 x 160 bit matrix, then a 10-row matrix built from
# 40 integers sampled from 1..5. Both draws consume the same RNG stream, so
# their order must not change.
set.seed(1337)
bitdata <- bitm(nrow = 10, ncol = 160)
intdata <- matrix(sample.int(n = 5, size = 40, replace = TRUE), nrow = 10)

# The same bits in sparse and in logical form.
bitdatasp <- Matrix::drop0(x = bitdata)
lbitdata <- bitdata != 0L

# First 6 and last 4 rows of the bit data ...
bit6 <- bitdata[seq_len(6), ]
bit4 <- bitdata[seq(7, 10), ]

# ... and their logical equivalents.
lbit6 <- bit6 != 0L
lbit4 <- bit4 != 0L

# Hamming
# Expected neighbor indices, from Annoy. One row per item with 4 neighbors
# each; the first column is the item's own (1-based) index.
expected_hamm_idx <- rbind(
  c(1, 7, 4, 5),
  c(2, 10, 3, 9),
  c(3, 4, 2, 7),
  c(4, 3, 1, 7),
  c(5, 6, 7, 1),
  c(6, 5, 10, 3),
  c(7, 1, 10, 5),
  c(8, 9, 10, 7),
  c(9, 8, 10, 4),
  c(10, 2, 9, 7)
)

# Expected neighbor distances, one row per item. The raw counts are divided by
# the number of columns of bitdata (i.e. normalized wrt ndim) for consistency
# with PyNNDescent.
expected_hamm_dist <- rbind(
  c(0, 72, 74, 77),
  c(0, 69, 78, 79),
  c(0, 65, 78, 79),
  c(0, 65, 74, 76),
  c(0, 67, 75, 77),
  c(0, 67, 80, 81),
  c(0, 72, 74, 75),
  c(0, 69, 77, 81),
  c(0, 69, 72, 78),
  c(0, 69, 72, 74)
) / ncol(bitdata)

# First 6 and last 4 rows of the integer data.
int6 <- intdata[seq_len(6), ]
int4 <- intdata[seq(7, 10), ]

# 6 x 6 matrix of counts divided by the number of columns of int6
# (presumably the pairwise Hamming distances between rows of int6 -- confirm).
# Each c(...) below is one column of the matrix.
int6hd <- cbind(
  c(0, 2, 3, 3, 3, 2),
  c(2, 0, 2, 2, 4, 4),
  c(3, 2, 0, 4, 4, 4),
  c(3, 2, 4, 0, 3, 3),
  c(3, 4, 4, 3, 0, 2),
  c(2, 4, 4, 3, 2, 0)
) / ncol(int6)

# Sums taken from RcppAnnoy, then normalized wrt the number of features for
# consistency with PyNNDescent.
bit4q_hdsum <- 1275 / ncol(bitdata)
bit6q_hdsum <- 1986 / ncol(bitdata)

# Distance matrices generated with Annoy

# 10 x 10 cosine distances. Each c(...) below is one column of the matrix.
# The values were recorded at mixed precision, so mirrored entries can differ
# slightly (e.g. 0.0011313 vs 1.131e-03) and two diagonal entries are tiny
# non-zero numbers rather than exactly 0.
ui10_cosd <- cbind(
  c(0.0000000, 0.0001315, 0.0007371, 1.131e-03, 2.628e-03, 0.0009933, 0.0004830, 2.966e-03, 1.830e-03, 0.0039657),
  c(0.0001315, 0.0000000, 0.0009095, 1.648e-03, 3.132e-03, 0.0009375, 0.0006808, 3.429e-03, 1.426e-03, 0.0033142),
  c(0.0007371, 0.0009095, 0.0000000, 2.441e-04, 6.824e-04, 0.0001688, 0.0004221, 8.124e-04, 8.191e-04, 0.0027981),
  c(0.0011313, 0.0016477, 0.0002441, 1.372e-09, 3.832e-04, 0.0007671, 0.0005494, 6.114e-04, 1.888e-03, 0.0043782),
  c(0.0026276, 0.0031324, 0.0006824, 3.832e-04, 0.000e+00, 0.0011068, 0.0014791, 7.131e-05, 2.038e-03, 0.0043755),
  c(0.0009933, 0.0009375, 0.0001688, 7.671e-04, 1.107e-03, 0.0000000, 0.0009307, 1.066e-03, 2.660e-04, 0.0016037),
  c(0.0004830, 0.0006808, 0.0004221, 5.494e-04, 1.479e-03, 0.0009307, 0.0000000, 1.947e-03, 1.850e-03, 0.0047715),
  c(0.0029658, 0.0034286, 0.0008124, 6.114e-04, 7.131e-05, 0.0010661, 0.0019470, 0.000e+00, 1.851e-03, 0.0037512),
  c(0.0018304, 0.0014256, 0.0008191, 1.888e-03, 2.038e-03, 0.0002660, 0.0018498, 1.851e-03, 1.945e-09, 0.0007827),
  c(0.0039657, 0.0033142, 0.0027981, 4.378e-03, 4.376e-03, 0.0016037, 0.0047715, 3.751e-03, 7.827e-04, 0.0000000)
)

# 10 x 10 symmetric matrix of Manhattan distances (per the `mand` suffix and
# the Annoy-generated group it belongs to). Each c(...) below is one column.
ui10_mand <- cbind(
  c(0.0, 1.7, 1.3, 2.5, 1.8, 0.6, 1.4, 2.1, 2.9, 1.2),
  c(1.7, 0.0, 0.6, 0.8, 0.9, 1.3, 0.5, 0.8, 1.2, 2.1),
  c(1.3, 0.6, 0.0, 1.2, 0.5, 0.7, 0.3, 0.8, 1.6, 1.7),
  c(2.5, 0.8, 1.2, 0.0, 0.9, 1.9, 1.1, 0.6, 0.6, 2.7),
  c(1.8, 0.9, 0.5, 0.9, 0.0, 1.2, 0.6, 0.3, 1.1, 2.2),
  c(0.6, 1.3, 0.7, 1.9, 1.2, 0.0, 1.0, 1.5, 2.3, 1.0),
  c(1.4, 0.5, 0.3, 1.1, 0.6, 1.0, 0.0, 0.7, 1.5, 2.0),
  c(2.1, 0.8, 0.8, 0.6, 0.3, 1.5, 0.7, 0.0, 0.8, 2.3),
  c(2.9, 1.2, 1.6, 0.6, 1.1, 2.3, 1.5, 0.8, 0.0, 2.7),
  c(1.2, 2.1, 1.7, 2.7, 2.2, 1.0, 2.0, 2.3, 2.7, 0.0)
)

# 10 x 10 symmetric matrix of counts divided by the number of columns of
# bitdata (per the `hamd` suffix these are Hamming distances).
# Each c(...) below is one column of the matrix.
bit10_hamd <- cbind(
  c(0, 85, 81, 74, 77, 90, 72, 93, 90, 90),
  c(85, 0, 78, 83, 82, 83, 89, 84, 79, 69),
  c(81, 78, 0, 65, 86, 81, 79, 82, 81, 89),
  c(74, 83, 65, 0, 83, 90, 76, 85, 78, 92),
  c(77, 82, 86, 83, 0, 67, 75, 82, 83, 81),
  c(90, 83, 81, 90, 67, 0, 82, 87, 88, 80),
  c(72, 89, 79, 76, 75, 82, 0, 81, 92, 74),
  c(93, 84, 82, 85, 82, 87, 81, 0, 69, 77),
  c(90, 79, 81, 78, 83, 88, 92, 69, 0, 72),
  c(90, 69, 89, 92, 81, 80, 74, 77, 72, 0)
) / ncol(bitdata)

# for uirism[1:10, ]
# 10 x 10 symmetric distance matrix (presumably correlation distance, going by
# the `cord` suffix -- confirm). Each c(...) below is one column of the matrix.
uirism10_cord <- cbind(
  c(
    0, 0.00400133876, 2.60889537e-05, 0.00183154822, 0.0006526685,
    0.000413946853, 0.00118880691, 0.000461852891, 0.00192335771,
    0.00344803882
  ),
  c(
    0.00400133876, 0, 0.00339291433, 0.00260336976, 0.00776732119,
    0.00640810993, 0.00927944638, 0.00288194472, 0.00145365862,
    0.00096714455
  ),
  c(
    2.60889537e-05, 0.00339291433, 0, 0.00166652079, 0.000938868047,
    0.000622703492, 0.00156232107, 0.000395491414, 0.00164390757,
    0.00301440442
  ),
  c(
    0.00183154822, 0.00260336976, 0.00166652079, 0, 0.00328117923,
    0.00216742062, 0.00386063303, 0.000454440488, 0.000166690505,
    0.000693152364
  ),
  c(
    0.0006526685, 0.00776732119, 0.000938868047, 0.00328117923, 0,
    0.000116680316, 8.60443273e-05, 0.00149684289, 0.00396913822,
    0.00623882524
  ),
  c(
    0.000413946853, 0.00640810993, 0.000622703492, 0.00216742062,
    0.000116680316, 0, 0.000277394147, 0.000821174669, 0.00278434617,
    0.00473942264
  ),
  c(
    0.00118880691, 0.00927944638, 0.00156232107, 0.00386063303,
    8.60443273e-05, 0.000277394147, 1.11022302e-16, 0.00204787836,
    0.00478602797, 0.0072727783
  ),
  c(
    0.000461852891, 0.00288194472, 0.000395491414, 0.000454440488,
    0.00149684289, 0.000821174669, 0.00204787836, 0, 0.000593789371,
    0.00162634825
  ),
  c(
    0.00192335771, 0.00145365862, 0.00164390757, 0.000166690505,
    0.00396913822, 0.00278434617, 0.00478602797, 0.000593789371, 0,
    0.000260225275
  ),
  c(
    0.00344803882, 0.00096714455, 0.00301440442, 0.000693152364,
    0.00623882524, 0.00473942264, 0.0072727783, 0.00162634825,
    0.000260225275, 0
  )
)

# Expected 4-nearest-neighbor result: a list holding a 10 x 4 matrix of
# neighbor indices (`idx`, first column is the item's own 1-based index) and
# the matching 10 x 4 matrix of distances (`dist`, first column all zero).
# Going by the name this is for ui10; the distances look like rounded
# Euclidean values -- confirm.
ui10_nn4 <- list(
  idx = rbind(
    c(1, 6, 10, 3),
    c(2, 7, 3, 5),
    c(3, 7, 5, 2),
    c(4, 9, 8, 2),
    c(5, 8, 3, 7),
    c(6, 1, 3, 10),
    c(7, 3, 2, 5),
    c(8, 5, 4, 7),
    c(9, 4, 8, 2),
    c(10, 6, 1, 3)
  ),
  dist = rbind(
    c(0, 0.3464, 0.6782, 0.7),
    c(0, 0.3, 0.4243, 0.4796),
    c(0, 0.2236, 0.3317, 0.4243),
    c(0, 0.3464, 0.4243, 0.5477),
    c(0, 0.1732, 0.3317, 0.3464),
    c(0, 0.3464, 0.5, 0.5831),
    c(0, 0.2236, 0.3, 0.3464),
    c(0, 0.1732, 0.4243, 0.4583),
    c(0, 0.3464, 0.5831, 0.6164),
    c(0, 0.5831, 0.6782, 1.044)
  )
)

# Hard-coded random projection forest index with explicit margins, stored as
# a plain nested list. Recorded from:
# set.seed(1337)
# rpf_build(ui10, metric = "euclidean", leaf_size = 4, margin = "explicit")
#
# Layout: `trees` holds 7 trees, each a list with
#   hyperplanes - numeric matrix with 4 columns (rows presumably correspond to
#                 tree nodes -- confirm)
#   offsets     - numeric vector with one entry per hyperplane row; some
#                 entries are NaN (presumably the leaf nodes -- confirm)
#   children    - integer matrix with 2 columns and the same number of rows
#   indices     - integer vector of length 10 holding each of 0..9 once
#                 (presumably 0-based row indices into ui10 -- confirm)
#   leaf_size   - 3 or 4 depending on the tree, although 4 was requested
# The remaining top-level entries are index metadata: margin, actual_metric,
# version, use_alt_metric, original_metric, sparse and type.
rpf_index_ls4e <-
  list(
    trees = list(
      list(hyperplanes = structure(c(
        -0.5, 0.300000190734863,
        0, 0, 0, -0.800000190734863, -0.300000190734863, 0, 0, 0, -0.200000047683716,
        0.100000023841858, 0, 0, 0, -0.300000011920929, -0.200000017881393,
        0, 0, 0
      ), dim = 5:4), offsets = c(
        5.77000093460083, -0.555000305175781,
        NaN, NaN, NaN
      ), children = structure(c(
        1L, 2L, 0L, 3L, 7L, 4L,
        3L, 3L, 7L, 10L
      ), dim = c(5L, 2L)), indices = c(
        2L, 4L, 7L, 1L,
        3L, 6L, 8L, 0L, 5L, 9L
      ), leaf_size = 4), list(
        hyperplanes = structure(c(
          0.599999904632568,
          0.0999999046325684, 0.400000095367432, -0.400000095367432, 0,
          0, 0, 0, 0, 0.0999999046325684, 0.300000190734863, 0, -0.299999952316284,
          0, 0, 0, 0, 0, 0.399999976158142, 0, 0.100000023841858, 0, 0,
          0, 0, 0, 0, 0, 0.100000001490116, -0.100000008940697, 0, 0, 0,
          0, 0, 0
        ), dim = c(9L, 4L)), offsets = c(
          -3.58499956130981, -1.4850001335144,
          -2.04000043869019, 3.14500045776367, NaN, NaN, NaN, NaN, NaN
        ),
        children = structure(c(
          1L, 2L, 3L, 4L, 0L, 2L, 5L, 6L, 8L,
          8L, 7L, 6L, 5L, 2L, 5L, 6L, 8L, 10L
        ), dim = c(9L, 2L)), indices = c(
          2L,
          6L, 0L, 5L, 9L, 1L, 4L, 7L, 3L, 8L
        ), leaf_size = 3
      ), list(
        hyperplanes = structure(c(
          1.09999990463257, -0.599999904632568,
          0, 0, 0.5, 0, 0, 0.700000047683716, -0.5, 0, 0, 0.199999809265137,
          0, 0, 0.399999976158142, -0.100000023841858, 0, 0, 0.100000023841858,
          0, 0, 0.100000001490116, -0.200000002980232, 0, 0, -0.100000001490116,
          0, 0
        ), dim = c(7L, 4L)), offsets = c(
          -8.21500015258789, 5.10999965667725,
          NaN, NaN, -3.05499935150146, NaN, NaN
        ), children = structure(c(
          1L,
          2L, 0L, 2L, 5L, 5L, 8L, 4L, 3L, 2L, 5L, 6L, 8L, 10L
        ), dim = c(
          7L,
          2L
        )), indices = c(2L, 6L, 0L, 5L, 9L, 1L, 4L, 7L, 3L, 8L),
        leaf_size = 3
      ), list(hyperplanes = structure(c(
        -0.0999999046325684,
        0, 0.800000190734863, 0, 0, -0.0999999046325684, 0, 0.299999952316284,
        0, 0, -0.100000023841858, 0, 0.100000023841858, 0, 0, 0, 0, -0.100000008940697,
        0, 0
      ), dim = 5:4), offsets = c(
        0.934999287128448, NaN, -5.18500089645386,
        NaN, NaN
      ), children = structure(c(
        1L, 0L, 3L, 3L, 6L, 2L, 3L,
        4L, 6L, 10L
      ), dim = c(5L, 2L)), indices = c(
        3L, 7L, 8L, 0L, 5L,
        9L, 1L, 2L, 4L, 6L
      ), leaf_size = 4), list(hyperplanes = structure(c(
        0.300000190734863,
        0.599999904632568, 0, 0, 0.400000095367432, 0, 0, -0.300000190734863,
        0.700000047683716, 0, 0, 0.5, 0, 0, 0.100000023841858, 0.100000023841858,
        0, 0, 0.200000047683716, 0, 0, -0.200000017881393, 0.100000001490116,
        0, 0, 0, 0, 0
      ), dim = c(7L, 4L)), offsets = c(
        -0.555000305175781,
        -5.5649995803833, NaN, NaN, -3.71500062942505, NaN, NaN
      ), children = structure(c(
        1L,
        2L, 0L, 2L, 5L, 5L, 8L, 4L, 3L, 2L, 5L, 6L, 8L, 10L
      ), dim = c(
        7L,
        2L
      )), indices = c(5L, 9L, 2L, 4L, 7L, 0L, 1L, 6L, 3L, 8L), leaf_size = 3),
      list(hyperplanes = structure(c(
        -0.599999904632568, -0.0999999046325684,
        0, 0, 0, -0.5, -0.0999999046325684, 0, 0, 0, -0.100000023841858,
        -0.100000023841858, 0, 0, 0, -0.200000002980232, 0, 0, 0,
        0
      ), dim = 5:4), offsets = c(
        5.10999965667725, 0.934999287128448,
        NaN, NaN, NaN
      ), children = structure(c(
        1L, 2L, 0L, 3L, 7L,
        4L, 3L, 3L, 7L, 10L
      ), dim = c(5L, 2L)), indices = c(
        3L, 7L,
        8L, 1L, 2L, 4L, 6L, 0L, 5L, 9L
      ), leaf_size = 4), list(hyperplanes = structure(c(
        0.900000095367432,
        0, -0.599999904632568, 0, 0, 0.900000095367432, 0, -0.5,
        0, 0, -0.299999952316284, 0, -0.100000023841858, 0, 0, 0.100000001490116,
        0, 0, 0, 0
      ), dim = 5:4), offsets = c(
        -7.62000131607056, NaN,
        4.53999948501587, NaN, NaN
      ), children = structure(c(
        1L, 0L,
        3L, 3L, 6L, 2L, 3L, 4L, 6L, 10L
      ), dim = c(5L, 2L)), indices = c(
        0L,
        5L, 9L, 3L, 7L, 8L, 1L, 2L, 4L, 6L
      ), leaf_size = 4)
    ), margin = "explicit",
    actual_metric = "sqeuclidean", version = "0.0.12", use_alt_metric = TRUE,
    original_metric = "euclidean", sparse = FALSE, type = "rnndescent:rpforest"
  )

# Hard-coded random projection forest index with implicit margins, stored as
# a plain nested list. Recorded from:
# set.seed(1337)
# rpf_build(ui10, metric = "euclidean", leaf_size = 4, margin = "implicit")
#
# Layout: `trees` holds 7 trees, each a list with
#   normal_indices - integer matrix with 2 columns; some rows are -1 in both
#                    columns (presumably leaf nodes, with the other rows
#                    naming the two points that define a split -- confirm)
#   children       - integer matrix with 2 columns and the same number of rows
#   indices        - integer vector of length 10 holding each of 0..9 once
#                    (presumably 0-based row indices into ui10 -- confirm)
#   leaf_size      - 3 or 4 depending on the tree, although 4 was requested
# The remaining top-level entries are index metadata: margin, actual_metric,
# version, use_alt_metric, original_metric, sparse and type.
rpf_index_ls4i <-
  list(
    trees = list(list(normal_indices = structure(c(
      4L, 4L, -1L,
      -1L, -1L, 0L, 1L, -1L, -1L, -1L
    ), dim = c(5L, 2L)), children = structure(c(
      1L,
      2L, 0L, 3L, 7L, 4L, 3L, 3L, 7L, 10L
    ), dim = c(5L, 2L)), indices = c(
      2L,
      4L, 7L, 1L, 3L, 6L, 8L, 0L, 5L, 9L
    ), leaf_size = 4), list(normal_indices = structure(c(
      4L,
      2L, 2L, 2L, -1L, -1L, -1L, -1L, -1L, 8L, 4L, 1L, 5L, -1L, -1L,
      -1L, -1L, -1L
    ), dim = c(9L, 2L)), children = structure(c(
      1L,
      2L, 3L, 4L, 0L, 2L, 5L, 6L, 8L, 8L, 7L, 6L, 5L, 2L, 5L, 6L, 8L,
      10L
    ), dim = c(9L, 2L)), indices = c(
      2L, 6L, 0L, 5L, 9L, 1L, 4L,
      7L, 3L, 8L
    ), leaf_size = 3), list(normal_indices = structure(c(
      5L,
      6L, -1L, -1L, 4L, -1L, -1L, 8L, 0L, -1L, -1L, 3L, -1L, -1L
    ), dim = c(
      7L,
      2L
    )), children = structure(c(
      1L, 2L, 0L, 2L, 5L, 5L, 8L, 4L,
      3L, 2L, 5L, 6L, 8L, 10L
    ), dim = c(7L, 2L)), indices = c(
      2L, 6L,
      0L, 5L, 9L, 1L, 4L, 7L, 3L, 8L
    ), leaf_size = 3), list(
      normal_indices = structure(c(
        7L,
        -1L, 5L, -1L, -1L, 4L, -1L, 1L, -1L, -1L
      ), dim = c(5L, 2L)),
      children = structure(c(
        1L, 0L, 3L, 3L, 6L, 2L, 3L, 4L, 6L,
        10L
      ), dim = c(5L, 2L)), indices = c(
        3L, 7L, 8L, 0L, 5L, 9L,
        1L, 2L, 4L, 6L
      ), leaf_size = 4
    ), list(normal_indices = structure(c(
      4L,
      5L, -1L, -1L, 6L, -1L, -1L, 1L, 7L, -1L, -1L, 3L, -1L, -1L
    ), dim = c(
      7L,
      2L
    )), children = structure(c(
      1L, 2L, 0L, 2L, 5L, 5L, 8L, 4L,
      3L, 2L, 5L, 6L, 8L, 10L
    ), dim = c(7L, 2L)), indices = c(
      5L, 9L,
      2L, 4L, 7L, 0L, 1L, 6L, 3L, 8L
    ), leaf_size = 3), list(
      normal_indices = structure(c(
        6L,
        7L, -1L, -1L, -1L, 0L, 4L, -1L, -1L, -1L
      ), dim = c(5L, 2L)),
      children = structure(c(
        1L, 2L, 0L, 3L, 7L, 4L, 3L, 3L, 7L,
        10L
      ), dim = c(5L, 2L)), indices = c(
        3L, 7L, 8L, 1L, 2L, 4L,
        6L, 0L, 5L, 9L
      ), leaf_size = 4
    ), list(
      normal_indices = structure(c(
        9L,
        -1L, 3L, -1L, -1L, 4L, -1L, 2L, -1L, -1L
      ), dim = c(5L, 2L)),
      children = structure(c(
        1L, 0L, 3L, 3L, 6L, 2L, 3L, 4L, 6L,
        10L
      ), dim = c(5L, 2L)), indices = c(
        0L, 5L, 9L, 3L, 7L, 8L,
        1L, 2L, 4L, 6L
      ), leaf_size = 4
    )), margin = "implicit", actual_metric = "sqeuclidean",
    version = "0.0.12", use_alt_metric = TRUE, original_metric = "euclidean",
    sparse = FALSE, type = "rnndescent:rpforest"
  )
# jlmelville/rnndescent documentation built on April 19, 2024, 8:26 p.m.