tests/testthat/test_mallet-functionality.R

context("mallet-functionality")

data(sotu)

test_that(desc="getVocabulary",{
  skip_on_cran()

  sotu.instances <-
    mallet.import(id.array = row.names(sotu),
                  text.array = sotu[["text"]],
                  stoplist = mallet_stoplist_file_path("en"),
                  token.regexp = "\\p{L}[\\p{L}\\p{P}]+\\p{L}")
  topic.model <- MalletLDA(num.topics=10, alpha.sum = 1, beta = 0.1)
  topic.model$loadDocuments(sotu.instances)
  topic.model$train(20)

  expect_silent(
    vocabulary <- topic.model$getVocabulary()
  )
  expect_equal(vocabulary[1:3], c("congress", "united", "states"))
})


test_that(desc="mallet.word.freqs",{
  skip_on_cran()

  sotu.instances <-
    mallet.import(id.array = row.names(sotu),
                  text.array = sotu[["text"]],
                  stoplist = mallet_stoplist_file_path("en"),
                  token.regexp = "\\p{L}[\\p{L}\\p{P}]+\\p{L}")
  topic.model <- MalletLDA(num.topics=10, alpha.sum = 1, beta = 0.1)
  topic.model$loadDocuments(sotu.instances)
  topic.model$train(20)

  expect_silent(
    word.freqs <- mallet.word.freqs(topic.model)
  )
  expect_equal(as.character(word.freqs[1:3,1]), c("congress", "united", "states"))
  expect_equal(word.freqs[1:3,2], c(1025, 508, 557))
  expect_equal(word.freqs[1:3,3], c(879, 426, 480))
  expect_true(all(word.freqs[,3] <= word.freqs[,2]))
})


test_that(desc="setAlphaOptimization",{
  skip_on_cran()

  sotu.instances <-
    mallet.import(id.array = row.names(sotu),
                  text.array = sotu[["text"]],
                  stoplist = mallet_stoplist_file_path("en"),
                  token.regexp = "\\p{L}[\\p{L}\\p{P}]+\\p{L}")
  topic.model <- MalletLDA(num.topics=10, alpha.sum = 1, beta = 0.1)
  topic.model$loadDocuments(sotu.instances)
  topic.model$train(20)

  expect_silent(
    topic.model$setAlphaOptimization(20, 50)
  )
})

test_that(desc="train optimized",{
  skip_on_cran()

  sotu.instances <-
    mallet.import(id.array = row.names(sotu),
                  text.array = sotu[["text"]],
                  stoplist = mallet_stoplist_file_path("en"),
                  token.regexp = "\\p{L}[\\p{L}\\p{P}]+\\p{L}")
  topic.model <- MalletLDA(num.topics=10, alpha.sum = 1, beta = 0.1)
  topic.model$loadDocuments(sotu.instances)
  topic.model$train(20)

  expect_silent(
    topic.model$train(100)
  )
})


test_that(desc="maximize",{
  skip_on_cran()

  sotu.instances <-
    mallet.import(id.array = row.names(sotu),
                  text.array = sotu[["text"]],
                  stoplist = mallet_stoplist_file_path("en"),
                  token.regexp = "\\p{L}[\\p{L}\\p{P}]+\\p{L}")
  topic.model <- MalletLDA(num.topics=10, alpha.sum = 1, beta = 0.1)
  topic.model$loadDocuments(sotu.instances)
  topic.model$train(20)

  expect_silent(
    topic.model$maximize(10)
  )
})


test_that(desc="Get parameter matrices",{
  skip_on_cran()

  sotu.instances <-
    mallet.import(id.array = row.names(sotu),
                  text.array = sotu[["text"]],
                  stoplist = mallet_stoplist_file_path("en"),
                  token.regexp = "\\p{L}[\\p{L}\\p{P}]+\\p{L}")
  topic.model <- MalletLDA(num.topics=10, alpha.sum = 1, beta = 0.1)
  topic.model$loadDocuments(sotu.instances)
  topic.model$train(20)

  expect_silent(
    doc.topics <- mallet.doc.topics(topic.model, smoothed=TRUE, normalized=TRUE)
  )

  expect_equal(dim(doc.topics), c(6816,10))
  expect_equal(object = rowSums(doc.topics), expected = rep(1,6816), tolerance = .00000000001, scale = 1)

  expect_silent(
    topic.words <- mallet.topic.words(topic.model, smoothed=TRUE, normalized=TRUE)
  )

  expect_equal(dim(topic.words), c(10, 13634))
  expect_equal(object = rowSums(topic.words), expected = rep(1,10), tolerance = .00000000001, scale = 1)

})

test_that(desc="mallet.top.words and mallet.topic.labels",{
  skip_on_cran()

  sotu.instances <-
    mallet.import(id.array = row.names(sotu),
                  text.array = sotu[["text"]],
                  stoplist = mallet_stoplist_file_path("en"),
                  token.regexp = "\\p{L}[\\p{L}\\p{P}]+\\p{L}")
  topic.model <- MalletLDA(num.topics=10, alpha.sum = 1, beta = 0.1)
  topic.model$loadDocuments(sotu.instances)
  topic.model$train(20)

  expect_silent(
    top.words <- mallet.top.words(topic.model, word.weights = mallet.topic.words(topic.model, smoothed=TRUE, normalized=TRUE)[2,], num.top.words = 5)
  )
  expect_equal(dim(top.words), c(5,2))
  expect_lt(object = sum(top.words$weight), expected = 1)

  checkmate::expect_character(mallet.topic.labels(topic.model), min.chars = 6, any.missing = FALSE, len = 10)
})


test_that(desc="mallet.subset",{
  skip_on_cran()

  sotu.instances <-
    mallet.import(id.array = row.names(sotu),
                  text.array = sotu[["text"]],
                  stoplist = mallet_stoplist_file_path("en"),
                  token.regexp = "\\p{L}[\\p{L}\\p{P}]+\\p{L}")
  topic.model <- MalletLDA(num.topics=10, alpha.sum = 1, beta = 0.1)
  topic.model$loadDocuments(sotu.instances)
  topic.model$train(20)

  modern_times <- sotu$year > 1975
  expect_silent({
    modern.topic.words <- mallet.subset.topic.words(topic.model,
                                                 subset.docs = modern_times,
                                                 smoothed=TRUE,
                                                 normalized=TRUE)
    not.modern.topic.words <- mallet.subset.topic.words(topic.model,
                                                   subset.docs = !modern_times,
                                                   smoothed=TRUE,
                                                   normalized=TRUE)
  })

  expect_equal(dim(modern.topic.words), c(10, 13634))
  expect_equal(dim(not.modern.topic.words), c(10, 13634))
  expect_true(any(modern.topic.words != not.modern.topic.words))
})




test_that(desc="mallet_jar",{
  skip_on_cran()

  expect_failure(expect_equal(mallet_jar(), "rmallet-202108.jar"))
  expect_equal(mallet_jar(), "rmallet-20220712.jar")
})

Try the mallet package in your browser

Any scripts or data that you put into this service are public.

mallet documentation built on July 20, 2022, 5:08 p.m.