## tests/testthat/test_dsm_projection.R

## Validate SVD and other latent subspace projections
## based on small "hieroglyphs" example matrix
library(Matrix)
library(wordspace)

## compare two matrices for near-equality; with ignore.sign=TRUE, the arbitrary
## sign of each column (e.g. of SVD dimensions) is aligned before the comparison
expect_matrix_equal <- function(x, y, tol=1e-10, ignore.sign=FALSE, note="", note1=note, note2=note) {
  name.x <- deparse(substitute(x))
  name.y <- deparse(substitute(y))
  x <- as.matrix(x)
  y <- as.matrix(y)
  expect_equal(dim(x), dim(y), 
               label=sprintf("dim(%s)%s", name.x, note1), 
               expected.label=sprintf("dim(%s)%s", name.y, note2))
  if (ignore.sign) {
    sign.vec <- sapply(seq_len(ncol(x)), function (i) {
      j <- which.max(abs(x[, i]))
      sign(x[j, i]) * sign(y[j, i])
    })
    x <- scaleMargins(x, cols=sign.vec)
  }
  expect_equal(x, y, tolerance=tol, ignore_attr=TRUE,
               label=paste0(name.x, note1), 
               expected.label=paste0(name.y, note2))
}
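
## Minimal illustration of the sign alignment (toy example, added sketch):
## Y differs from X only by the sign of its second column, which
## ignore.sign=TRUE absorbs before the comparison.
X.toy <- matrix(c(1, 2, 3, 4, 5, 6), nrow=3)
expect_matrix_equal(X.toy, scaleMargins(X.toy, cols=c(1, -1)), ignore.sign=TRUE)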

## compare distances between row vectors (for M1 and M2 in different subspaces)
##  - tol = maximum relative error, as a proportion of the larger of the two average distances
dist.compare <- function (M1, M2, method="euclidean", tol=.001, 
                          note="", note1=note, note2=note, expect=TRUE) { 
  name.M1 <- deparse(substitute(M1))
  name.M2 <- deparse(substitute(M2))
  if (expect) expect_true(nrow(M1) == nrow(M2),
                          label=sprintf("nrow(%s)%s == nrow(%s)%s", name.M1, note1, name.M2, note2))
  nr <- nrow(M1)
  M1 <- dist.matrix(M1, method=method)
  M2 <- dist.matrix(M2, method=method)
  n <- nr * (nr - 1)  # number of off-diagonal cells in the symmetric distance matrices
  sum1 <- sum(M1) / n # average distance in M1
  sum2 <- sum(M2) / n # average distance in M2
  d <- sqrt(sum((M1 - M2)^2) / n) # root mean squared difference
  rel.error <- d / max(sum1, sum2)
  if (expect) expect_lte(rel.error, !!tol,
                         label=sprintf("relative error for distances in %s%s vs. %s%s", name.M1, note1, name.M2, note2))
  invisible(rel.error)
}
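
## Sanity check for the helper itself (added sketch): identical matrices must
## yield exactly zero relative error, anchoring the scale of the measure.
expect_equal(dist.compare(diag(4), diag(4), expect=FALSE), 0)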

## test data: sparse co-occurrence matrix in sparse and dense representation
O <- DSM_HieroglyphsMatrix
E <- outer(rowSums(O), colSums(O)) / sum(O)
M1 <- scale((O - E) / sqrt(E), center=TRUE, scale=FALSE)  # z-score (signed), centered (so SVD projection == PCA)
M2 <- Matrix(M1, sparse=TRUE) # force sparse matrix representation to test sparse algorithms
expect_s4_class(M2, "dgCMatrix")
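
## Cross-check of the comment above (added sketch): since M1 is column-centered,
## its SVD projection should coincide with PCA scores from prcomp(), up to the
## arbitrary sign of each component.
pca.R <- prcomp(M1, center=FALSE)
expect_matrix_equal(pca.R$x[, 1:3], dsm.projection(M1, method="svd", n=3), tol=1e-6, ignore.sign=TRUE)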


## full-rank SVD decomposition
svd.R <- svd(M1, nu=6, nv=6)
proj.R <- svd.R$u %*% diag(svd.R$d) # projection into SVD space
approx.R <- proj.R %*% t(svd.R$v)   # matrix approximation

proj.ws <- dsm.projection(M1, method="svd", n=6, with.basis=TRUE)
approx.ws <- proj.ws %*% t(attr(proj.ws, "basis"))
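
## Hedged check, assuming the "sigma" attribute stores the singular values of
## the projection (the power-scaling tests below rely on this): these should
## match the singular values reported by base R's svd().
expect_equal(as.numeric(attr(proj.ws, "sigma")), svd.R$d[1:6], tolerance=1e-6)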

test_that("full-rank SVD matches original matrix", {
  dist.compare(proj.R, M1)
  dist.compare(approx.R, M1)
  expect_matrix_equal(approx.R, M1, tol=1e-6)
  
  dist.compare(proj.ws, M1)
  dist.compare(approx.ws, M1)
  expect_matrix_equal(approx.ws, M2, tol=1e-6)
})


## low-rank SVD projection
svd.R <- svd(M1, nu=3, nv=3)
proj.R <- svd.R$u %*% diag(svd.R$d[1:3]) # projection into SVD space
proj.ws <- dsm.projection(M1, method="svd", n=3)
proj.ws.sparse <- dsm.projection(M2, method="svd", n=3)

test_that("low-rank SVD projection is reasonable approximation", {
  dist.compare(proj.R, proj.ws)
  dist.compare(proj.R, M1, tol=.2) # relative error for SVD approximation should be ca. 10% 
  dist.compare(proj.ws, M1, tol=.2)
  dist.compare(proj.ws.sparse, M2, tol=.2)
  dist.compare(proj.ws, proj.ws.sparse)
  
  expect_matrix_equal(proj.R, proj.ws, tol=1e-6, ignore.sign=TRUE) # must ignore arbitrary sign of SVD dims
  expect_matrix_equal(proj.R, proj.ws.sparse, tol=1e-6, ignore.sign=TRUE)
})
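
## Added consistency sketch: a low-rank projection should simply truncate the
## full-rank projection to its first 3 dimensions (signs may differ in principle).
test_that("low-rank SVD projection truncates the full-rank projection", {
  proj.full <- dsm.projection(M1, method="svd", n=6)
  expect_matrix_equal(proj.ws, proj.full[, 1:3], tol=1e-6, ignore.sign=TRUE)
})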


## verify power scaling for dense and sparse SVD
test_that("SVD power scaling works as expected", {
  proj.ws.p2 <- dsm.projection(M1, method="svd", n=3, power=2)        # dense
  expect_matrix_equal(proj.ws.p2, svd.R$u %*% diag(svd.R$d[1:3]^2), tol=1e-6, ignore.sign=TRUE)
  proj.ws.sparse.p2 <- dsm.projection(M2, method="svd", n=3, power=2) # sparse
  expect_matrix_equal(proj.ws.sparse.p2, proj.ws.p2, tol=1e-6, ignore.sign=TRUE)
  
  expect_matrix_equal(proj.ws.p2, scaleMargins(proj.ws, cols=attr(proj.ws, "sigma")), tol=1e-6, ignore.sign=TRUE) # post-hoc power scaling
  expect_matrix_equal(proj.ws.sparse.p2, scaleMargins(proj.ws.sparse, cols=attr(proj.ws.sparse, "sigma")), tol=1e-6, ignore.sign=TRUE)
  
  proj.ws.p0 <- dsm.projection(M1, method="svd", n=3, power=0)       # whitening
  expect_matrix_equal(proj.ws.p0, svd.R$u, tol=1e-6, ignore.sign=TRUE)
})  
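
## Analogous to the power=2 case above (added sketch): whitening (power=0) can
## be obtained post hoc by dividing out the singular values.
test_that("whitening can be obtained by post-hoc rescaling", {
  proj.p0 <- scaleMargins(proj.ws, cols=1 / attr(proj.ws, "sigma"))
  expect_matrix_equal(proj.p0, dsm.projection(M1, method="svd", n=3, power=0), tol=1e-6, ignore.sign=TRUE)
})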


## full-rank randomized SVD
test_that("full-rank rSVD is equivalent to SVD", {
  set.seed(42) # for predictable results
  rsvd.proj.ws <- dsm.projection(M1, method="rsvd", n=6, oversampling=2, with.basis=TRUE)
  rsvd.approx.ws <- rsvd.proj.ws %*% t(attr(rsvd.proj.ws, "basis"))
  set.seed(42)
  rsvd.proj.ws.sparse <- dsm.projection(M2, method="rsvd", n=6, oversampling=2, with.basis=TRUE)
  rsvd.approx.ws.sparse <- rsvd.proj.ws.sparse %*% t(attr(rsvd.proj.ws.sparse, "basis"))
  
  dist.compare(rsvd.proj.ws, M1)
  expect_matrix_equal(rsvd.approx.ws, M1, tol=1e-6)
  dist.compare(rsvd.proj.ws.sparse, M2)
  expect_matrix_equal(rsvd.approx.ws.sparse, M2, tol=1e-6)
})  


## randomized SVD projection (with intermediate full rank)
test_that("rSVD with full-rank oversampling is equivalent to SVD", {
  set.seed(42)
  rsvd.proj.ws <- dsm.projection(M1, method="rsvd", n=3, oversampling=3)
  set.seed(42)
  rsvd.proj.ws.sparse <- dsm.projection(M2, method="rsvd", n=3, oversampling=3)
  
  dist.compare(rsvd.proj.ws, M1, tol=.2)
  dist.compare(rsvd.proj.ws, proj.ws)
  dist.compare(rsvd.proj.ws.sparse, M2, tol=.2)
  dist.compare(rsvd.proj.ws.sparse, proj.ws.sparse)
})


## randomized SVD projection (with reduced intermediate rank)
test_that("rSVD with reduced oversampling approximates to SVD", {
  set.seed(42)
  rsvd.proj.ws <- dsm.projection(M1, method="rsvd", n=2, oversampling=2)
  set.seed(42)
  rsvd.proj.ws.sparse <- dsm.projection(M2, method="rsvd", n=2, oversampling=2)

  dist.compare(rsvd.proj.ws, M1, tol=.35) # relative error should be ca. 30%
  dist.compare(rsvd.proj.ws.sparse, M2, tol=.35)
  dist.compare(rsvd.proj.ws, rsvd.proj.ws.sparse) # same seed, so dense and sparse results should agree
})


## random indexing (with completely filled random vectors)
relerr.dense <- sapply(1:100, function (k) {
  set.seed(k)
  ri.proj.ws <- dsm.projection(M1, method="ri", n=3, rate=1, verbose=FALSE)
  dist.compare(ri.proj.ws, M1, expect=FALSE)
})
relerr.sparse <- sapply(1:100, function (k) {
  set.seed(k)
  ri.proj.ws <- dsm.projection(M2, method="ri", n=3, rate=1, verbose=FALSE)
  dist.compare(ri.proj.ws, M2, expect=FALSE)
})

res <- t.test(relerr.dense, relerr.sparse)
if (FALSE) { # set to TRUE to print diagnostic summaries
  cat(sprintf("\nRelative error for dense RI (n=3, wordspace):  %4.1f +/- %4.1f %%\n", 
              100 * mean(relerr.dense), 100 * sd(relerr.dense)))
  cat(sprintf("Relative error for sparse RI (n=3, wordspace): %4.1f +/- %4.1f %%\n", 
              100 * mean(relerr.sparse), 100 * sd(relerr.sparse)))
  cat(sprintf("T-test for difference in means:  p = %.3f\n", res$p.value))
}
  
test_that("Random Indexing produces reasonable approximations", {
  expect_lt(min(relerr.dense), .3)
  expect_lt(median(relerr.dense), .5)
  expect_lt(min(relerr.sparse), .3)
  expect_lt(median(relerr.sparse), .5)
  expect_gt(res$p.value, .01) # with completely filled random vectors, both methods should have similar performance 
})
