# tests/testthat/test-read_keywords.R

# Shared fixtures used by the tests below.
# library() is used instead of require(): require() only returns FALSE (with a
# warning) when the package is missing, so later calls would fail obscurely.
library(quanteda)
data(keyATM_data_bills)
# The `doc_dfm` element of the example data, converted to keyATM documents.
bills_dfm <- keyATM_data_bills$doc_dfm
keyATM_docs <- keyATM_read(bills_dfm)
# The `keywords` element of the example data; turned into a dictionary below.
bills_keywords <- keyATM_data_bills$keywords

test_that("normal usage", {
  # Wrap the example keyword list in a quanteda dictionary object.
  dict <- quanteda::dictionary(bills_keywords)

  # Regular call: both a dictionary and the documents are supplied.
  keywords <- read_keywords(docs = keyATM_docs, dictionary = dict)
  expect_equal(keywords$Drug, "drug")

  ## defensive programming
  # No arguments at all is an error; a dictionary without `docs` warns.
  expect_error(read_keywords())
  expect_warning(read_keywords(dictionary = dict))
})

test_that("resolving and split", {
  ## a simplified example from koheiw/quanteda.seededlda
  corp <- corpus(c(
    "air force 1",
    "soldier soldiers navy army politician",
    "party members",
    "parliament is politicians' party politics leaders leadership"
  ))
  toks <- tokens(corp, remove_punct = TRUE)
  # Keep purely alphabetic features and drop English stopwords.
  dfmt <- dfm(toks) %>%
    dfm_select("^[A-Za-z]+$", valuetype = "regex") %>%
    dfm_remove(stopwords("en"))
  docs <- keyATM_read(texts = dfmt)
  # Dictionary mixing wildcard patterns ("soldier*") and multi-word entries
  # ("air force") so that both resolving and splitting are exercised.
  smalldictionary <- dictionary(list(
    politics = c(
      "parliament*",
      "congress*",
      "party leader*",
      "party member*",
      "voter*",
      "lawmaker*",
      "politician*"
    ),
    military = c(
      "military",
      "soldier*",
      "air force",
      "marine",
      "navy",
      "army"
    )
  ))
  ## default: split = TRUE
  x <- read_keywords(dictionary = smalldictionary, docs = docs)
  # "soldier*" must resolve to the word "soldiers" found in the documents.
  expect_true("soldiers" %in% x$military)
  # With splitting on, "air force" contributes the single word "air".
  expect_true("air" %in% x$military)
  # With splitting off, "air" alone must not appear among the keywords.
  y <- read_keywords(dictionary = smalldictionary, docs = docs, split = FALSE)
  expect_false("air" %in% y$military)
})

test_that("file I/O", {
  ## the example of the function
  # Needs network access, so it is skipped on CRAN (and on Linux).
  skip_on_cran()
  skip_on_os("linux")

  # Download the dictionary file into a temporary location.
  dict_url <- "https://moralfoundations.org/wp-content/uploads/files/downloads/moral%20foundations%20dictionary.dic"
  dictfile <- tempfile()
  download.file(url = dict_url, destfile = dictfile, quiet = TRUE)

  ## first position is file.
  # The path is deliberately passed positionally, with the format named.
  keywords <- read_keywords(dictfile, docs = keyATM_docs, format = "LIWC")
  expect_true("terrorism" %in% keywords$IngroupVice)
})

test_that("Integration", {
  # End-to-end check: downloaded corpus -> dfm -> keyATM docs -> keywords read
  # from a YAML dictionary -> fitted base model -> top words per topic.
  # Needs network access, so it is skipped on CRAN (and on Linux).
  skip_on_cran()
  skip_on_os("linux")
  ## exact example: koheiw/quanteda.seededlda
  rdsfile <- tempfile()
  download.file(
    "https://github.com/koheiw/quanteda.seededlda/raw/master/tests/data/data_corpus_sputnik.RDS",
    rdsfile,
    quiet = TRUE
  )

  corp <- readRDS(rdsfile)
  toks <- tokens(corp, remove_punct = TRUE)
  # Keep alphabetic, non-stopword features, then trim by term and document
  # frequency.
  dfmt <- dfm(toks) %>%
    dfm_select("^[A-Za-z]+$", valuetype = "regex") %>%
    dfm_remove(stopwords("en")) %>%
    dfm_trim(
      min_termfreq = 0.90,
      termfreq_type = "quantile",
      max_docfreq = 0.1,
      docfreq_type = "prop"
    )
  docs <- keyATM_read(texts = dfmt)
  # The ".yml" extension is set explicitly; no `format` argument is passed
  # to read_keywords() below.
  dictfile <- tempfile(fileext = ".yml")
  download.file(
    "https://raw.githubusercontent.com/koheiw/quanteda.seededlda/master/tests/data/topics.yml",
    dictfile,
    quiet = TRUE
  )
  kw <- read_keywords(dictfile, docs = docs)
  # Fixed seed and a small iteration count keep the fit reproducible and fast.
  res <- keyATM(
    docs,
    no_keyword_topics = 0,
    keywords = kw,
    model = "base",
    options = list(seed = 1234, iterations = 20)
  )
  # Compute the top-word table once and reuse it for both expectations.
  top <- top_words(res, n = 20, show_keyword = FALSE)
  military_top <- top[, "5_military"]
  ## syria is related to military
  expect_true("syria" %in% military_top)
  ## German is not related to military
  expect_false("german" %in% military_top)
})

# Try the keyATM package in your browser

# Any scripts or data that you put into this service are public.

# keyATM documentation built on Aug. 8, 2025, 6:14 p.m.