## tests for universal functions (umap_universal.R)
cat("\ntest_knn\n")
source("synthetic.R")
source("train_test.R")
## ############################################################################
## Tests for exact nearest neighbors extraction
test_that("k nearest neighbors complains when k is too large or too small", {
## da is a distance matrix (not necessarily metric)
da = matrix(0, ncol=5, nrow=5)
expect_error(knn.from.dist(da, 6))
expect_error(knn.from.dist(da, 0))
expect_silent(knn.from.dist(da, 3))
})
test_that("k nearest neighbors information", {
## da is a distance matrix (not necessarily metric)
da = matrix(0, ncol=5, nrow=5)
## object 1 - nearest items are 2,3
da[1,2] = da[2,1] = 1
da[1,3] = da[3,1] = 2
da[1,4] = da[4,1] = 3
da[1,5] = da[5,1] = 4
## object 2 - nearest items are 1,4
da[2,3] = da[3,2] = 10
da[2,4] = da[4,2] = 2
da[2,5] = da[5,2] = 11
## object 3 - nearest items are 1,4
da[3,4] = da[4,3] = 3
da[3,5] = da[5,3] = 12
## object 4 - nearest items are 5,2
da[4,5] = da[5,4] = 1
da.dist = as.dist(da)
## compute knn from matrix and from dist
result = knn.from.dist(da, 3)
resultd = knn.from.dist(da.dist, 3)
## the results should be numerically equal, but (dist) introduces rownames
## check for numerical equality only (not attributes)
expect_equal(result, resultd, check.attributes=FALSE)
## check content of the matrices
## these matrices constructed manually by visual inspection of da
expected.indexes = matrix(c(1,2,3, 2,1,4, 3,1,4, 4,5,2, 5,4,1),
byrow=TRUE, nrow=5, ncol=3)
expected.distances = matrix(c(0,1,2, 0,1,2, 0,2,3, 0,1,2, 0,1,4),
byrow=TRUE, nrow=5, ncol=3)
expect_equal(result$indexes, expected.indexes)
expect_equal(result$distances, expected.distances)
})
## ############################################################################
## Tests for well-formed output
test_that("knn.from.data should preserve rownames", {
result = knn.from.data(t(syn0), 3, mdEuclidean)
expect_equal(rownames(syn0), rownames(result$indexes))
expect_equal(rownames(syn0), rownames(result$distances))
})
test_that("knn.from.data should preserve rownames", {
syn0dist = as.matrix(syn0.dist)
result = knn.from.dist(syn0dist, 3)
expect_equal(rownames(syn0), rownames(result$indexes))
expect_equal(rownames(syn0), rownames(result$distances))
})
test_that("knn.info (brute force) should preserve rownames", {
conf = umap.defaults
conf$n_neighbors = 4
conf$metric.function = mdEuclidean
result = knn.info(syn0, conf, brute.force=TRUE)
expected.rownames = c(rownames(result$indexes), rownames(result$distances))
result.rownames = c(rownames(syn0), rownames(syn0))
expect_equal(result.rownames, expected.rownames)
})
test_that("knn.info (from data) should preserve rownames", {
conf = umap.defaults
conf$n_neighbors = 4
conf$metric.function = mdEuclidean
result = knn.info(syn0, conf, brute.force=FALSE)
expected.rownames = c(rownames(result$indexes), rownames(result$distances))
result.rownames = c(rownames(syn0), rownames(syn0))
expect_equal(result.rownames, expected.rownames)
})
## ############################################################################
## Tests for approximate nearest neighbors extraction
test_that("knn from data complains when k is too large or too small", {
## da is a dummy matrix
da = matrix(0, ncol=5, nrow=5)
expect_error(knn.from.data(da, 6, mdEuclidean))
expect_error(knn.from.data(da, 0, mdEuclidean))
expect_silent(knn.from.data(da, 3, mdEuclidean))
})
test_that("knn from data complains about k (large dataset)", {
## da is a dummy, but large, matrix
da = matrix(0, ncol=5, nrow=5000)
expect_error(knn.from.data(da, 6000, mdEuclidean))
expect_error(knn.from.data(da, 0, mdEuclidean))
expect_error(knn.from.data(da, -0.5, mdEuclidean))
})
test_that("knn from data complains about subsampling (large dataset)", {
## da is a dummy, but large, matrix
da = matrix(0, nrow=20, ncol=300)
expect_error(knn.from.data(da, 5, mdEuclidean, subsample.k=NA), "subsample")
expect_error(knn.from.data(da, 5, mdEuclidean, subsample.k=-0.2), "subsample")
expect_error(knn.from.data(da, 5, mdEuclidean, subsample.k=600), "subsample")
expect_silent(knn.from.data(da, 6, mdEuclidean, subsample.k=0.3))
})
test_that("knn returns a reasonable set of data", {
## dd is a matrix with points on a plane
dd = matrix(0, ncol=2, nrow=10)
## cluster 1 with points near (1,1)
dd[1,] = c(1, 1)
dd[2,] = c(1, 2)
dd[3,] = c(2, 1)
## cluster with points near (10, 10)
dd[4,] = c(10,10)
dd[5,] = c(10,11)
dd[6,] = c(12,12)
## cluster with points near (20, 20)
dd[7,] = c(20,20)
dd[8,] = c(20,21)
dd[9,] = c(21,20)
dd[10,] = c(21,21)
result = knn.from.data(t(dd), 3, mdEuclidean)
expect_equal(dim(result$indexes), c(10, 3))
expect_equal(dim(result$distances), c(10, 3))
})
test_that("knn for large dataset queries a small number of distances", {
dlarge = matrix(0, ncol=2, nrow=300)
dlarge[,1] = runif(nrow(dlarge), -2, 2)
dlarge[,2] = runif(nrow(dlarge), -2, 2)
result.dist = knn.from.dist(dist(dlarge), 4)
result = knn.from.data.reps(dlarge, 4, mdEuclidean, subsample=4, reps=1)
result.reps = knn.from.data.reps(dlarge, 4, mdEuclidean, subsample=4, reps=2)
## all self distances are zero
expect_equal(sum(result$distances[,1]), 0)
## distance to nearest neighbor should be small
expect_lt(mean(result$distances[,2]), 0.4)
## distances in two-reps are smaller
totdist1 = sum(apply(result$distances, 1, sum))
totdist2 = sum(apply(result.reps$distances, 2, sum))
expect_lt(totdist2, totdist1)
})
## ############################################################################
## Tests with degenerate neighbors
test_that("knn works with degenerate neighbors", {
## syn1 has so many same-location points, that first nearest neighbor
## should always be at distance 0
result = knn.from.data(t(syn1[, 1:2]), 4, mdEuclidean)
expect_lt(mean(result$distances[,2]), 0.01)
})
## ############################################################################
## Tests with spectators
test_that("knn from data links spectators to primary data", {
## create merged dataset with observations in columns
dT = cbind(t(i.train), t(i.test))
num.primary = nrow(i.train)
result = knn.from.data(dT, 6, mdEuclidean, fix.observations=num.primary)
## result should have indexes and distance components
expect_true(all(c("indexes", "distances") %in% names(result)))
## indexes should show all observation link to themselves
expect_equivalent(result$indexes[,1], 1:nrow(result$indexes))
## indexes to further neighbors should only link to primary data
expect_lt(max(result$indexes[, 2:6]), num.primary+1)
})
train.size = nrow(i.train)/3
test.size = nrow(i.test)/3
test_that("spectator.info by brute force links spectators to primary data", {
conf = umap.check.config(umap.defaults)
conf$n_neighbors = 5
result = spectator.knn.info(i.test, i.train, conf, brute.force=TRUE)
## check output sizes
expect_equal(dim(result$indexes), c(nrow(i.test), conf$n_neighbors))
expect_equal(dim(result$distances), c(nrow(i.test), conf$n_neighbors))
expect_equal(rownames(result$distances), rownames(i.test))
expect_equal(rownames(result$indexes), rownames(i.test))
## ditances of first neighbor must be larger than zero
expect_equal(mean(result$distances[,1]), 0)
## entries in i.test should connect to items in i4, which are more numerous than i.test
indexes = result$indexes[, 2:ncol(result$indexes)]
expect_gt(max(indexes), nrow(i.test))
## the first 5 items in i.test must be neighbors to the first 15 indexes in i4, etc.
expect_lt(mean(indexes[1:test.size,]), train.size)
expect_lt(mean(indexes[test.size+(1:test.size),]), 2*train.size)
expect_gt(mean(indexes[2*test.size+(1:test.size),]), 2*train.size)
})
test_that("spectator.info by stochastic method links spectators to primary data", {
conf = umap.check.config(umap.defaults)
conf$n_neighbors = 5
result = spectator.knn.info(i.test, i.train, conf, brute.force=FALSE)
## check output sizes
expect_equal(dim(result$indexes), c(nrow(i.test), conf$n_neighbors))
expect_equal(dim(result$distances), c(nrow(i.test), conf$n_neighbors))
expect_equal(rownames(result$distances), rownames(i.test))
expect_equal(rownames(result$indexes), rownames(i.test))
## ditances of first neighbor must be larger than zero
expect_equal(mean(result$distances[,1]), 0)
## entries in i.test should connect to items in i4, which are more numerous than i.test
indexes = result$indexes[, 2:ncol(result$indexes)]
expect_gt(max(indexes), nrow(i.test))
## the first 5 items in i.test must be neighbors to the first 15 indexes in i4, etc.
expect_lt(mean(indexes[1:test.size,]), train.size)
expect_lt(mean(indexes[test.size+(1:test.size),]), 2*train.size)
expect_gt(mean(indexes[2*test.size+(1:test.size),]), 2*train.size)
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.