tests/testthat/test-refit.tidylda.R

context("tests of refit method for tidylda")

dtm <- nih_sample_dtm

d1 <- dtm[1:50, ]

d2 <- dtm[51:100, ]

# make sure we have different vocabulary for each data set
d1 <- d1[, Matrix::colSums(d1) > 0]

d2 <- d2[, Matrix::colSums(d2) > 0]

lda <- tidylda(
  data = d1,
  k = 4,
  iterations = 20, burnin = 10,
  alpha = 0.1, eta = 0.05,
  optimize_alpha = TRUE,
  calc_likelihood = TRUE,
  calc_r2 = TRUE,
  return_data = FALSE,
  verbose = FALSE
)



### Tests for refit.tidylda ----

test_that("can update models", {
  # continuation of the old model for another 20 iterations
  # matters because dtm lines up exactly with existing vocabulary etc.
  lda2 <- refit(
    object = lda,
    new_data = d1,
    iterations = 20,
    burnin = 10,
    prior_weight = NA,
    additional_k = 0,
    optimize_alpha = TRUE,
    calc_likelihood = TRUE,
    calc_r2 = TRUE,
    return_data = FALSE,
    verbose = FALSE
  )
  
  expect_named(lda2, names(lda))
  
  expect_equal(nrow(lda2$theta), nrow(d1))
  
  expect_equal(ncol(lda2$theta), ncol(lda$theta))
  
  expect_equal(sum(dim(lda2$beta) == dim(lda$beta)), 2)
  
  expect_equal(sum(dim(lda2$lambda) == dim(lda$lambda)), 2)
  
  
  # new data adding no extra topics no beta as prior
  lda2 <- refit(
    object = lda,
    new_data = d2,
    additional_k = 0,
    prior_weight = NA,
    iterations = 20,
    burnin = 10,
    optimize_alpha = TRUE,
    calc_likelihood = TRUE,
    calc_r2 = TRUE,
    return_data = FALSE,
    verbose = FALSE
  )
  
  expect_named(lda2, names(lda))
  
  expect_equal(nrow(lda2$theta), nrow(d2))
  
  expect_equal(ncol(lda2$theta), ncol(lda$theta))
  
  expect_equal(ncol(lda2$beta), length(union(colnames(d1), colnames(d2))))
  
  # 1 additonal topic and no beta as prior
  lda2 <- refit(
    object = lda,
    new_data = d2,
    additional_k = 1,
    prior_weight = NA,
    iterations = 20,
    burnin = 10,
    optimize_alpha = TRUE,
    calc_likelihood = TRUE,
    calc_r2 = TRUE,
    return_data = FALSE,
    verbose = FALSE
  )
  
  expect_named(lda2, names(lda))
  
  expect_equal(nrow(lda2$theta), nrow(d2))
  
  expect_equal(ncol(lda2$theta), ncol(lda$theta) + 1)
  
  expect_equal(ncol(lda2$beta), length(union(colnames(d1), colnames(d2))))
  
  # 3 additional topics and no beta as prior
  lda2 <- refit(
    object = lda,
    new_data = d2,
    additional_k = 3,
    prior_weight = NA,
    iterations = 20,
    burnin = 10,
    optimize_alpha = TRUE,
    calc_likelihood = TRUE,
    calc_r2 = TRUE,
    return_data = FALSE,
    verbose = FALSE
  )
  
  expect_named(lda2, names(lda))
  
  expect_equal(nrow(lda2$theta), nrow(d2))
  
  expect_equal(ncol(lda2$theta), ncol(lda$theta) + 3)
  
  expect_equal(ncol(lda2$beta), length(union(colnames(d1), colnames(d2))))
  
  # no additional topics and beta as prior
  lda2 <- refit(
    object = lda,
    new_data = d2,
    additional_k = 0,
    prior_weight = 1,
    iterations = 20,
    burnin = 10,
    optimize_alpha = TRUE,
    calc_likelihood = TRUE,
    calc_r2 = TRUE,
    return_data = FALSE,
    verbose = FALSE
  )
  
  expect_named(lda2, names(lda))
  
  expect_equal(nrow(lda2$theta), nrow(d2))
  
  expect_equal(ncol(lda2$theta), ncol(lda$theta))
  
  expect_equal(ncol(lda2$beta), length(union(colnames(d1), colnames(d2))))
  
  expect_true(inherits(lda2$eta, "matrix"))
  
  # 3 additonal topics and beta as prior
  lda2 <- refit(
    object = lda,
    new_data = d2,
    additional_k = 3,
    prior_weight = 1,
    iterations = 20,
    burnin = 10,
    optimize_alpha = TRUE,
    calc_likelihood = TRUE,
    calc_r2 = TRUE,
    return_data = FALSE,
    verbose = FALSE
  )
  
  expect_named(lda2, names(lda))
  
  expect_equal(nrow(lda2$theta), nrow(d2))
  
  expect_equal(ncol(lda2$theta), ncol(lda$theta) + 3)
  
  expect_equal(ncol(lda2$beta), length(union(colnames(d1), colnames(d2))))
  
  # update models with scalar eta
  
  # update models with matrix eta
  l1 <- tidylda(
    data = d1,
    k = 4,
    iterations = 20, burnin = 10,
    alpha = 0.1,
    eta = matrix(0.05, nrow = 4, ncol = ncol(d1)),
    optimize_alpha = TRUE,
    calc_likelihood = TRUE,
    calc_r2 = TRUE,
    return_data = FALSE,
    verbose = FALSE
  )
  
  l2 <- refit(l1, d2, iterations = 20, verbose = FALSE)
  
  expect_equal(ncol(l2$eta), length(union(colnames(d1), colnames(d2))))
})

test_that("errors are thrown for malformed inputs to refit.tidylda", {
  
  # no vocabulary overlap between models
  nd <- rbind(numeric(10), numeric(10), numeric(10))
  
  colnames(nd) <- 1:10 # numbers means no vocab overlap
  
  lda2 <- refit(
    object = lda, 
    new_data = nd,
    iterations = 10,
    verbose = FALSE
  )
  
  expect_s3_class(lda2, "tidylda")
  
  # data doesn't have column names
  d3 <- d2
  colnames(d3) <- NULL
  expect_error(
    refit(
      object = lda,
      new_data = d3,
      iterations = 20,
      verbose = FALSE
    )
  )
  
  
  # iterations not specified
  expect_error(
    refit(
      object = lda,
      new_data = d2
    )
  )
  
  # burnin >= iterations
  expect_error(
    refit(
      object = lda,
      new_data = d2,
      additional_k = 3,
      beta_as_prior = TRUE,
      iterations = 20,
      burnin = 30,
      optimize_alpha = TRUE,
      calc_likelihood = TRUE,
      calc_r2 = TRUE,
      return_data = FALSE
    )
  )
  
  # additional_k is not numeric
  expect_error(
    refit(
      object = lda,
      new_data = d2,
      additional_k = "3",
      beta_as_prior = TRUE,
      iterations = 20,
      burnin = 10,
      optimize_alpha = TRUE,
      calc_likelihood = TRUE,
      calc_r2 = TRUE,
      return_data = FALSE
    )
  )
  
  # additional_k is less than zero
  expect_error(
    refit(
      object = lda,
      new_data = d2,
      additional_k = -3,
      beta_as_prior = TRUE,
      iterations = 20,
      burnin = 10,
      optimize_alpha = TRUE,
      calc_likelihood = TRUE,
      calc_r2 = TRUE,
      return_data = FALSE
    )
  )
  
  # iterations not specified
  expect_error(
    refit(
      object = lda,
      new_data = d2,
      additional_k = 3,
      beta_as_prior = TRUE
    )
  )
  
  # malformed prior weight
  expect_error(
    refit(
      object = lda,
      new_data = d2,
      additional_k = 3,
      prior_weight = TRUE,
      iterations = 20,
      burnin = 10,
      optimize_alpha = TRUE,
      calc_likelihood = TRUE,
      calc_r2 = TRUE,
      return_data = FALSE
    )
  )
  
  # logical things aren't logical
  expect_error(
    refit(
      object = lda,
      new_data = d2,
      additional_k = 3,
      prior_weight = 1,
      iterations = 20,
      burnin = 10,
      optimize_alpha = "TRUE",
      calc_likelihood = TRUE,
      calc_r2 = TRUE,
      return_data = FALSE
    )
  )
  
  expect_error(
    refit(
      object = lda,
      new_data = d2,
      additional_k = 3,
      beta_as_prior = TRUE,
      iterations = 20,
      burnin = 10,
      optimize_alpha = TRUE,
      calc_likelihood = "TRUE",
      calc_r2 = TRUE,
      return_data = FALSE
    )
  )
  expect_error(
    refit(
      object = lda,
      new_data = d2,
      additional_k = 3,
      beta_as_prior = TRUE,
      iterations = 20,
      burnin = 10,
      optimize_alpha = TRUE,
      calc_likelihood = TRUE,
      calc_r2 = "TRUE",
      return_data = FALSE
    )
  )
  
  expect_error(
    refit(
      object = lda,
      new_data = d2,
      additional_k = 3,
      beta_as_prior = TRUE,
      iterations = 20,
      burnin = 10,
      optimize_alpha = TRUE,
      calc_likelihood = TRUE,
      calc_r2 = TRUE,
      return_data = "FALSE"
    )
  )
})

Try the tidylda package in your browser

Any scripts or data that you put into this service are public.

tidylda documentation built on July 26, 2023, 5:34 p.m.