## tests/testthat/test-kNN.R

library("testthat")
library("dbscan")

## label the expectations in this file as the "kNN" group
context("kNN")

## Simulated test data: n noisy 3-d points placed around 10 random centers.
## The seed is fixed so that every run works on exactly the same points.
set.seed(665544)
n <- 1000
n_centers <- 10L

## The centers are recycled explicitly (instead of relying on silent vector
## recycling) so point i belongs to center ((i - 1) %% n_centers) + 1.
x <- cbind(
  x = rep(runif(n_centers, 0, 10), length.out = n) + rnorm(n, sd = 0.2),
  y = rep(runif(n_centers, 0, 10), length.out = n) + rnorm(n, sd = 0.2),
  z = rep(runif(n_centers, 0, 10), length.out = n) + rnorm(n, sd = 0.2)
)

## no duplicates first! All distances should be unique
x <- x[!duplicated(x), , drop = FALSE]

## seq_len() stays correct even for an empty matrix, unlike 1:nrow(x)
rownames(x) <- paste0("Object_", seq_len(nrow(x)))

## reference result: sorted k nearest neighbors using the default search
k <- 5L
nn <- dbscan::kNN(x, k = k, sort = TRUE)

## check dimensions: one row per point and one column per neighbor
nn_dim <- c(nrow(x), k)
expect_equal(nn$k, k)
expect_equal(dim(nn$dist), nn_dim)
expect_equal(dim(nn$id), nn_dim)

## check visually
#plot(x)
#points(x[nn$id[1,],], col="red", lwd=5)
#points(x[nn$id[2,],], col="green", lwd=5)

## the same neighbors have to be found from a precomputed dist object
nn_d <- dbscan::kNN(dist(x), k = k, sort = TRUE)

## check visually
#plot(x)
#points(x[nn_d$id[1,],], col="red", lwd=5)
#points(x[nn_d$id[2,],], col="green", lwd=5)

### both results are sorted, so they can be compared directly
expect_equal(nn, nn_d)

## let kNN calculate the distances internally
nn_d2 <- dbscan::kNN(x, k = k, search = "dist", sort = TRUE)
expect_equal(nn, nn_d2)

## without sorting the neighbors of a point may be returned in a different
## order, so the ids of every row are sorted before they are compared
sort_rows <- function(ids) t(apply(ids, MARGIN = 1, sort))
nn2 <- dbscan::kNN(x, k = k, sort = FALSE)
expect_equal(sort_rows(nn$id), sort_rows(nn2$id))

## search options: linear search has to agree with the reference result
nn_linear <- dbscan::kNN(x, k = k, search = "linear", sort = TRUE)
expect_equal(nn, nn_linear)

## split options: the split rule must not change the result
split_rules <- c("STD", "MIDPT", "FAIR", "SL_FAIR")
for (split_rule in split_rules) {
  nn3 <- dbscan::kNN(x, k = k, splitRule = split_rule, sort = TRUE)
  expect_equal(nn, nn3)
}

## bucket size: the bucket size must not change the result either
bucket_sizes <- c(5, 10, 15, 100)
for (bucket_size in bucket_sizes) {
  nn3 <- dbscan::kNN(x, k = k, bucketSize = bucket_size, sort = TRUE)
  expect_equal(nn, nn3)
}

## the order is not stable with matching distances which means that the
## k-NN are not stable. We add 100 copied points to check if self match
## filtering and sort works
x <- rbind(x, x[sample(seq_len(nrow(x)), 100), ])
rownames(x) <- paste0("Object_", seq_len(nrow(x)))

k <- 5L
nn <- dbscan::kNN(x, k = k, sort = TRUE)

## compare with manually found NNs
nn_d <- dbscan::kNN(x, k = k, search = "dist", sort = TRUE)

expect_equal(nn$dist, nn_d$dist)
## Comparing the ids directly is expected to fail because the ids are not
## stable for matching distances:
## expect_equal(nn$id, nn_d$id)
## Instead, the ids are validated with checks that do not depend on how ties
## are broken.

## self match filtering: no point may be listed as its own neighbor, even
## though every copied point has a twin at distance 0
expect_false(any(nn$id == row(nn$id)))
expect_false(any(nn_d$id == row(nn_d$id)))

## sorting: distances must be non-decreasing within every row
expect_false(any(apply(nn$dist, MARGIN = 1, is.unsorted)))
expect_false(any(apply(nn_d$dist, MARGIN = 1, is.unsorted)))

## Recompute the Euclidean distance between every point and each neighbor id
## reported for it; returns a matrix with the same layout as nn$dist.
id_distances <- function(x, nn) {
  ## as.vector() unrolls the id matrix column by column, so the row index of
  ## the source points has to be repeated once per neighbor column
  from <- rep(seq_len(nrow(x)), times = ncol(nn$id))
  to <- as.vector(nn$id)
  delta <- x[from, , drop = FALSE] - x[to, , drop = FALSE]
  matrix(sqrt(rowSums(delta^2)), nrow = nrow(x))
}

## every reported id has to lie at exactly the reported distance
expect_equivalent(id_distances(x, nn), nn$dist)
expect_equivalent(id_distances(x, nn_d), nn_d$dist)


## missing values, but distances are fine
## only the first coordinate of a few points is missing: the default search
## has to refuse the data, while both dist-based variants must agree
na_rows <- c(1, 3, 5)
x_na <- x
x_na[na_rows, 1] <- NA
expect_error(dbscan::kNN(x_na, k = 3), regexp = "NA")
res_d1 <- dbscan::kNN(x_na, k = 3, search = "dist")
res_d2 <- dbscan::kNN(dist(x_na), k = 3)
expect_equal(res_d1, res_d2)

## introduce NAs into dist
## with every coordinate of these points missing, all three variants are
## expected to fail with an error that mentions NA
x_na[na_rows, ] <- NA
expect_error(dbscan::kNN(x_na, k = 3), regexp = "NA")
expect_error(dbscan::kNN(x_na, k = 3, search = "dist"), regexp = "NA")
expect_error(dbscan::kNN(dist(x_na), k = 3), regexp = "NA")

## inf
## Infinite coordinates must not make any of the search variants fail.
## expect_error(..., NA) asserts "no error", so each smoke call is recorded as
## an explicit expectation instead of being run with its result discarded.
x_inf <- x
x_inf[c(1, 3, 5), 2] <- Inf
expect_error(dbscan::kNN(x_inf, k = 3), NA)
expect_error(dbscan::kNN(x_inf, k = 3, search = "dist"), NA)
expect_error(dbscan::kNN(dist(x_inf), k = 3), NA)


## sort and kNN to reduce k
## an existing kNN object can be passed back to kNN() to obtain fewer
## neighbors, but never more than it already contains
k_large <- 10
k_small <- 5
nn10 <- dbscan::kNN(x, k = k_large)
#nn10 <- dbscan::kNN(x, k = 10, sort = FALSE)
## knn now returns sorted lists
#expect_equal(nn10$sort, FALSE)
expect_error(dbscan::kNN(nn10, k = k_large + 1))

nn5 <- dbscan::kNN(nn10, k = k_small)
expect_equal(nn5$sort, TRUE)
expect_equal(ncol(nn5$id), 5L)
expect_equal(ncol(nn5$dist), 5L)

## test with simple data: ten equally spaced points on a line
x <- data.frame(x = 1:10, row.names = LETTERS[1:10])
nn <- dbscan::kNN(x, k = 5)

## the end points have no tied distances, so their ids are unambiguous
expect_equivalent(nn$id[1, ], c(2, 3, 4, 5, 6))
expect_equivalent(nn$id[10, ], c(9, 8, 7, 6, 5))

## Point 5 has tied neighbors on both sides (4/6, 3/7 and 2/8) and the order
## of ids with matching distances is not stable. The distances are therefore
## checked exactly and the ids only up to the order within each tie.
expect_equivalent(nn$dist[5, ], c(1, 1, 2, 2, 3))
expect_equivalent(sort(nn$id[5, 1:2]), c(4, 6))
expect_equivalent(sort(nn$id[5, 3:4]), c(3, 7))
expect_true(nn$id[5, 5] %in% c(2, 8))

## test kNN with query
## the first eight points form the data set, the last two are the queries
x <- data.frame(x = 1:10, row.names = LETTERS[1:10])
x_data <- x[1:8, , drop = FALSE]
x_query <- x[9:10, , drop = FALSE]

nn <- dbscan::kNN(x_data, x_query, k = 5)

## one result row per query point; both queries lie beyond point 8, so the
## neighbors are the last five data points, closest first
expect_equivalent(nrow(nn$id), 2L)
expect_equivalent(nn$id[1, ], 8:4)
expect_equivalent(nn$id[2, ], 8:4)

## a query is expected to be rejected when the data is given as a dist object
expect_error(nn <- dbscan::kNN(dist(x_data), x_query, k = 5))

## Try the dbscan package in your browser
##
## Any scripts or data that you put into this service are public.
##
## dbscan documentation built on Oct. 29, 2022, 1:13 a.m.