tests/testthat/test-tidy-methods.R

context("tests of tidy methods for tidylda")

dtm <- nih_sample_dtm

d1 <- dtm[1:50, ]

d2 <- dtm[51:100, ]

# make sure we have different vocabulary for each data set
d1 <- d1[, Matrix::colSums(d1) > 0]

d2 <- d2[, Matrix::colSums(d2) > 0]

lda <- tidylda(
  data =  d1,
  k = 4,
  iterations = 20, burnin = 10,
  alpha = 0.1, eta = 0.05,
  optimize_alpha = TRUE,
  calc_likelihood = TRUE,
  calc_r2 = TRUE,
  return_data = FALSE,
  verbose = FALSE
)

### test tidy methods ----

test_that("tidy.tidylda works as expected", {
  
  # tidy beta
  tidy_beta <- tidy(
    x = lda,
    matrix = "beta"
  )
  
  expect_named(tidy_beta, c("topic", "token", "beta"))
  
  expect_type(tidy_beta[[1]], "double")
  
  expect_type(tidy_beta[[2]], "character")
  
  expect_type(tidy_beta[[3]], "double")
  
  # log to tidy beta
  tidy_beta_log <- tidy(
    x = lda,
    matrix = "beta",
    log = TRUE
  )
  
  expect_named(tidy_beta_log, c("topic", "token", "log_beta"))
  
  expect_type(tidy_beta_log[[1]], "double")
  
  expect_type(tidy_beta_log[[2]], "character")
  
  expect_type(tidy_beta_log[[3]], "double")
  
  expect_equal(sum(colnames(lda$beta) %in% tidy_beta$token), length(colnames(lda$beta)))
  
  # tidy theta
  tidy_theta <- tidy(
    x = lda,
    matrix = "theta"
  )
  
  expect_named(tidy_theta, c("document", "topic", "theta"))
  
  expect_type(tidy_theta[[1]], "character")
  
  expect_type(tidy_theta[[2]], "double")
  
  expect_type(tidy_theta[[3]], "double")
  
  expect_equal(sum(colnames(lda$theta) %in% tidy_theta$topic), length(colnames(lda$theta)))
  
  # tidy lambda
  tidy_lambda <- tidy(
    x = lda,
    matrix = "lambda"
  )
  
  expect_named(tidy_lambda, c("topic", "token", "lambda"))
  
  expect_type(tidy_lambda[[1]], "double")
  
  expect_type(tidy_lambda[[2]], "character")
  
  expect_type(tidy_lambda[[3]], "double")
  
  expect_equal(sum(colnames(lda$lambda) %in% tidy_lambda$token), length(colnames(lda$lambda)))
  
})

test_that("tidy throws errors for malformed inputs", {
  expect_error(
    tidy(
      x = lda,
      matrix = "WRONG"
    )
  )
  
  expect_error(
    tidy(
      x = lda,
      matrix = 1
    )
  )
  
  expect_error(
    tidy(
      x = lda,
      matrix = "beta",
      log = "WRONG"
    )
  )
  
  # matrices the same as above
  expect_error(
    tidy_theta <- tidy(
      x = lda$theta,
      matrix = "WRONG"
    )
  )
  
  expect_error(
    tidy_theta <- tidy(
      x = lda$theta,
      matrix = "theta",
      log = "WRONG"
    )
  )
  
})

### tests for the glance method ----

test_that("glance.tidylda behaves nicely", {
  
  # well-formed call
  g <- glance(lda)
  
  expect_named(g, c(
    "num_topics", "num_documents", "num_tokens",
    "iterations", "burnin"
  ))
  
  expect_equal(g$num_topics, nrow(lda$beta))
  
  expect_equal(g$num_documents, nrow(lda$theta))
  
  expect_equal(g$num_tokens, ncol(lda$beta))
  
  expect_equal(g$iterations, lda$call$iterations)
  
  expect_equal(g$burnin, lda$call$burnin)

})

test_that("glance works with updated models", {
  l2 <- refit(lda, d2, iterations = 20, verbose = FALSE)
  
  g <- glance(l2)
  
  expect_named(g, c(
    "num_topics", "num_documents", "num_tokens",
    "iterations", "burnin"
  ))
  
  expect_equal(g$num_topics, nrow(l2$beta))
  
  expect_equal(g$num_documents, nrow(l2$theta))
  
  expect_equal(g$num_tokens, ncol(l2$beta))
  
  expect_equal(g$iterations, l2$call$iterations)
  
  expect_equal(g$burnin, NA)
})

### tests for the glance method ----

# make a tidy tibble
# note this uses unigrams and bigrams to ensure that there isn't 
# 100% overlap in vocabulary between data and model
tidy_docs <- 
  nih_sample[1:10, ] %>% 
  dplyr::select(APPLICATION_ID, ABSTRACT_TEXT) %>% 
  tidytext::unnest_tokens(output = word, 
                          input = ABSTRACT_TEXT,
                          stopwords = tidytext::stop_words$word,
                          token = "ngrams",
                          n_min = 1, n = 2) 

test_that("augment.tidylda behaves nicely", {
  
  td <- tidy_docs

  # tidy tibble input
  a <- augment(
    x = lda, 
    data = td,
    type = "class",
    document_col = colnames(td)[1],
    term_col = colnames(td)[2]
  )
  
  # right number of rows?
  expect_equal(nrow(a), nrow(tidy_docs))
  
  # correctly identified topics?
  expect_equal(
    a %>% 
      dplyr::filter(.[[colnames(td)[2]]] %in% colnames(lda$beta)) %>%
      dplyr::select(topic) %>%
      dplyr::summarise(na_topic = sum(is.na(topic))) %>%
      as.numeric,
    0,
    label = "augment is.na(topic class)"
  )
  
  # sparse dtm input
  
  a <- augment(
    x = lda,
    data = d1,
    type = "prob"
  )
  
  expect_false(
    a %>% 
      dplyr::filter(term %in% colnames(lda$beta)) %>%
      dplyr::select(-c(document, term)) %>%
      colSums() %>%
      sum %>% 
      is.na,
    label = "augment is.na(topic probs)"
  )
  
  # probably should test other dtm types...
  
})


test_that("augment.tidylda reacts to malformed inputs correctly", {
  
  expect_error(
    augment(
      x = "wat",
      data = d1,
      type = "prob"
    )
  )
  
  expect_error(
    augment(
      x = lda,
      data = "wat",
      type = "prob"
    )
  )
  
  expect_error(
    augment(
      x = lda,
      data = d1,
      type = "wat"
    )
  )
  
})

Try the tidylda package in your browser

Any scripts or data that you put into this service are public.

tidylda documentation built on July 26, 2023, 5:34 p.m.