tests/testthat/test_knn.R

# tests for universal functions (umap_universal.R)

source("synthetic.R")
source("train_test.R")

# ############################################################################
# Tests for exact nearest neighbors extraction

test_that("k nearest neighbors complains when k is too large or too small", {
  # da is a distance matrix (not necessarily metric)
  da <- matrix(0, ncol=5, nrow=5)
  expect_error(knn.from.dist(da, 6))
  expect_error(knn.from.dist(da, 0))
  expect_silent(knn.from.dist(da, 3))
})

test_that("k nearest neighbors information", {
  # da is a distance matrix (not necessarily metric)
  da <- matrix(0, ncol=5, nrow=5)
  # object 1 - nearest items are 2,3 
  da[1,2] <- da[2,1] <- 1
  da[1,3] <- da[3,1] <- 2
  da[1,4] <- da[4,1] <- 3
  da[1,5] <- da[5,1] <- 4
  # object 2 - nearest items are 1,4
  da[2,3] <- da[3,2] <- 10
  da[2,4] <- da[4,2] <- 2
  da[2,5] <- da[5,2] <- 11
  # object 3 - nearest items are 1,4
  da[3,4] <- da[4,3] <- 3
  da[3,5] <- da[5,3] <- 12
  # object 4 - nearest items are 5,2
  da[4,5] <- da[5,4] <- 1
  da.dist <- as.dist(da)
  
  # compute knn from matrix and from dist
  result <- knn.from.dist(da, 3)
  resultd <- knn.from.dist(da.dist, 3)
  # the results should be numerically equal, but (dist) introduces rownames
  # check for numerical equality only (not attributes)
  expect_equal(result, resultd, check.attributes=FALSE)

  # check content of the matrices
  # these matrices constructed manually by visual inspection of da
  expected.indexes <- matrix(c(1,2,3, 2,1,4, 3,1,4, 4,5,2, 5,4,1),
                            byrow=TRUE, nrow=5, ncol=3)
  expected.distances <- matrix(c(0,1,2, 0,1,2, 0,2,3, 0,1,2, 0,1,4),
                              byrow=TRUE, nrow=5, ncol=3)
  expect_equal(result$indexes, expected.indexes)
  expect_equal(result$distances, expected.distances)
})

# ############################################################################
# Tests for well-formed output

test_that("knn.from.data should preserve rownames", {
  result <- knn.from.data(t(syn0), 3, mdEuclidean)
  expect_is(result, "umap.knn")
  expect_equal(rownames(syn0), rownames(result$indexes))
  expect_equal(rownames(syn0), rownames(result$distances))
})

test_that("knn.from.data should preserve rownames", {
  syn0dist <- as.matrix(syn0.dist)
  result <- knn.from.dist(syn0dist, 3)
  expect_is(result, "umap.knn")
  expect_equal(rownames(syn0), rownames(result$indexes))
  expect_equal(rownames(syn0), rownames(result$distances))  
})

test_that("knn.info (brute force) should preserve rownames", {
  conf <- umap.defaults
  conf$n_neighbors <- 4
  conf$metric.function <- mdEuclidean
  result <- knn.info(syn0, conf, brute.force=TRUE)
  expected.rownames <- c(rownames(result$indexes), rownames(result$distances))
  result.rownames <- c(rownames(syn0), rownames(syn0))
  expect_equal(result.rownames, expected.rownames)
})

test_that("knn.info (from data) should preserve rownames", {
  conf <- umap.defaults
  conf$n_neighbors <- 4
  conf$metric.function <- mdEuclidean
  result <- knn.info(syn0, conf, brute.force=FALSE)
  expected.rownames <- c(rownames(result$indexes), rownames(result$distances))
  result.rownames <- c(rownames(syn0), rownames(syn0))
  expect_is(result, "umap.knn")
  expect_equal(result.rownames, expected.rownames)
})

# ############################################################################
# Tests for approximate nearest neighbors extraction

test_that("knn from data complains when k is too large or too small", {
  # da is a dummy matrix
  da <- matrix(0, ncol=5, nrow=5)
  expect_error(knn.from.data(da, 6, mdEuclidean))
  expect_error(knn.from.data(da, 0, mdEuclidean))
  expect_silent(knn.from.data(da, 3, mdEuclidean))
})

test_that("knn from data complains about k (large dataset)", {
  # da is a dummy, but large, matrix 
  da <- matrix(0, ncol=5, nrow=5000)
  expect_error(knn.from.data(da, 6000, mdEuclidean))
  expect_error(knn.from.data(da, 0, mdEuclidean))
  expect_error(knn.from.data(da, -0.5, mdEuclidean))
})

test_that("knn from data complains about subsampling (large dataset)", {
  # da is a dummy, but large, matrix 
  da <- matrix(0, nrow=20, ncol=300)
  expect_error(knn.from.data(da, 5, mdEuclidean, subsample.k=NA),
               "subsample")
  expect_error(knn.from.data(da, 5, mdEuclidean, subsample.k=-0.2),
               "subsample")
  expect_error(knn.from.data(da, 5, mdEuclidean, subsample.k=600),
               "subsample")
  expect_silent(knn.from.data(da, 6, mdEuclidean, subsample.k=0.3))
})

test_that("knn returns a reasonable set of data", {

  # dd is a matrix with points on a plane
  dd <- matrix(0, ncol=2, nrow=10)
  # cluster 1 with points near (1,1)
  dd[1,] <- c(1, 1)
  dd[2,] <- c(1, 2)
  dd[3,] <- c(2, 1)
  # cluster with points near (10, 10)
  dd[4,] <- c(10,10)
  dd[5,] <- c(10,11)
  dd[6,] <- c(12,12)
  # cluster with points near (20, 20)
  dd[7,] <- c(20,20)
  dd[8,] <- c(20,21)
  dd[9,] <- c(21,20)
  dd[10,] <- c(21,21)
  
  result <- knn.from.data(t(dd), 3, mdEuclidean)
  expect_equal(dim(result$indexes), c(10, 3))
  expect_equal(dim(result$distances), c(10, 3))
})

test_that("knn for large dataset queries a small number of distances", {
  dlarge <- matrix(0, ncol=2, nrow=300)
  dlarge[,1] <- runif(nrow(dlarge), -2, 2)
  dlarge[,2] <- runif(nrow(dlarge), -2, 2)
  result.dist <- knn.from.dist(dist(dlarge), 4)
  result <- knn.from.data.reps(dlarge, 4, mdEuclidean, subsample=4, reps=1)
  result.reps <- knn.from.data.reps(dlarge, 4, mdEuclidean, subsample=4, reps=2)
  
  # all self distances are zero
  expect_equal(sum(result$distances[,1]), 0)
  # distance to nearest neighbor should be small
  expect_lt(mean(result$distances[,2]), 0.4)
  # distances in two-reps are smaller
  totdist1 <- sum(apply(result$distances, 1, sum))
  totdist2 <- sum(apply(result.reps$distances, 2, sum))
  expect_lt(totdist2, totdist1)
})

# ############################################################################
# Tests with degenerate neighbors

test_that("knn works with degenerate neighbors", {
  # syn1 has so many same-location points, that first nearest neighbor
  # should always be at distance 0
  result <- knn.from.data(t(syn1[, 1:2]), 4, mdEuclidean)
  expect_lt(mean(result$distances[,2]), 0.01)
})

# ############################################################################
# Tests with spectators

test_that("knn from data links spectators to primary data", {
  # create merged dataset with observations in columns
  dT <- cbind(t(i.train), t(i.test))
  num.primary <- nrow(i.train)
  result <- knn.from.data(dT, 6, mdEuclidean, fix.observations=num.primary)
  # result should have indexes and distance components
  expect_true(all(c("indexes", "distances") %in% names(result)))
  # indexes should show all observation link to themselves
  expect_equivalent(result$indexes[,1], seq_len(nrow(result$indexes)))
  # indexes to further neighbors should only link to primary data
  expect_lt(max(result$indexes[, 2:6]), num.primary+1)
})

train.size <- nrow(i.train)/3
test.size <- nrow(i.test)/3

test_that("spectator.info by brute force links spectators to primary data", {
  conf <- umap.prep.config(umap.defaults, random_state=123)
  conf$n_neighbors <- 5
  result <- spectator.knn.info(i.test, i.train, conf, brute.force=TRUE)
  # check output sizes
  expect_equal(dim(result$indexes), c(nrow(i.test), conf$n_neighbors))
  expect_equal(dim(result$distances), c(nrow(i.test), conf$n_neighbors))
  expect_equal(rownames(result$distances), rownames(i.test))
  expect_equal(rownames(result$indexes), rownames(i.test))
  # ditances of first neighbor must be larger than zero
  expect_equal(mean(result$distances[,1]), 0)
  # entries in i.test should connect to items in i4,
  # which are more numerous than i.test
  indexes <- result$indexes[, 2:ncol(result$indexes)]
  expect_gt(max(indexes), nrow(i.test))
  # the first 5 items in i.test must be neighbors to the
  # first 15 indexes in i4, etc.
  expect_lt(mean(indexes[1:test.size,]), train.size)
  expect_lt(mean(indexes[test.size+(1:test.size),]), 2*train.size)
  expect_gt(mean(indexes[2*test.size+(1:test.size),]), 2*train.size)
})

test_that("spectator.info by stochastic method links to primary data", {
  conf <- umap.prep.config(umap.defaults, random_state=123)
  conf$n_neighbors <- 5
  result <- spectator.knn.info(i.test, i.train, conf, brute.force=FALSE)
  # check output sizes
  expect_equal(dim(result$indexes), c(nrow(i.test), conf$n_neighbors))
  expect_equal(dim(result$distances), c(nrow(i.test), conf$n_neighbors))
  expect_equal(rownames(result$distances), rownames(i.test))
  expect_equal(rownames(result$indexes), rownames(i.test))
  # ditances of first neighbor must be larger than zero
  expect_equal(mean(result$distances[,1]), 0)
  # entries in i.test should connect to items in i4,
  # which are more numerous than i.test
  indexes <- result$indexes[, 2:ncol(result$indexes)]
  expect_gt(max(indexes), nrow(i.test))
  # the first 5 items in i.test must be neighbors to the
  # first 15 indexes in i4, etc.
  expect_lt(mean(indexes[1:test.size,]), train.size)
  expect_lt(mean(indexes[test.size+(1:test.size),]), 2*train.size)
  expect_gt(mean(indexes[2*test.size+(1:test.size),]), 2*train.size)
})

Try the umap package in your browser

Any scripts or data that you put into this service are public.

umap documentation built on Feb. 16, 2023, 10:12 p.m.