tests/testthat/test_RollingLDA.R

context("RollingLDA")

data("economy_texts")
data("economy_dates")

roll_proto = RollingLDA(economy_texts, economy_dates, "quarter", "6 month",
                        init = 20, K = 5, n = 10)
roll_lda = RollingLDA(economy_texts, economy_dates, "quarter", "6 month",
                      init = 20, K = 5, type = "lda")

roll_proto2 = RollingLDA(economy_texts, economy_dates, "quarter", "6 month",
                         init = 100, K = 10, alpha = 1, eta = 1, num.iterations = 20,
                         n = 10, id = "myid", seeds = 1:10)
roll_lda2 = RollingLDA(economy_texts, economy_dates, "quarter", "6 month",
                       init = 100, K = 10, alpha = 1, eta = 1, num.iterations = 20,
                       id = "myid", type = "lda")

docs = getDocs(roll_proto2, names = getNames(roll_proto2)[1:getChunks(roll_proto2)$n[1]])
vocab = getVocab(roll_proto2)[1:getChunks(roll_proto2)$n.vocab[1]]
dates = getDates(roll_proto2)[1:getChunks(roll_proto2)$n[1]]
proto2 = LDAPrototype(docs, vocab, n = 10, seeds = 1:10,
                      K = 10, alpha = 1, eta = 1, num.iterations = 20)

init_proto = as.RollingLDA(lda = getLDA(proto2), dates = dates, docs = docs, id = "myid")
seed = Sys.time()
set.seed(seed)
init_proto_updated1 = RollingLDA(init_proto,
                                 economy_texts[!names(economy_texts) %in% names(docs)],
                                 economy_dates[!names(economy_dates) %in% names(docs)],
                                 "quarter", "6 month")
set.seed(seed)
init_proto_updated2 = updateRollingLDA(init_proto,
                                       economy_texts[!names(economy_texts) %in% names(docs)],
                                       economy_dates[!names(economy_dates) %in% names(docs)],
                                       "quarter", "6 month")
init_proto_updated3 = updateRollingLDA(init_proto,
                                       economy_texts[!names(economy_texts) %in% names(docs)],
                                       economy_dates[!names(economy_dates) %in% names(docs)],
                                       "quarter", "6 month")

test_that("is.RollingLDA", {
  expect_true(is.RollingLDA(roll_lda))
  expect_true(is.RollingLDA(roll_lda, verbose = TRUE))
  expect_message(is.RollingLDA(roll_lda, verbose = TRUE), "id")
  expect_message(is.RollingLDA(roll_lda, verbose = TRUE), "lda")
  expect_message(is.RollingLDA(roll_lda, verbose = TRUE), "docs")
  expect_message(is.RollingLDA(roll_lda, verbose = TRUE), "dates")
  expect_message(is.RollingLDA(roll_lda, verbose = TRUE), "vocab")
  expect_message(is.RollingLDA(roll_lda, verbose = TRUE), "chunks")
  expect_message(is.RollingLDA(roll_lda, verbose = TRUE), "param")
  tmp = roll_lda
  ind = sample(getChunks(tmp)$chunk.id, 1)
  tmp$chunks$n[ind+1] = tmp$chunks$n[ind+1] + 1L
  expect_false(is.RollingLDA(tmp))
  expect_false(is.RollingLDA(tmp, verbose = TRUE))
  expect_message(is.RollingLDA(tmp, verbose = TRUE),
                 "sum of \"n\" does not match number of texts")
  expect_true(is.RollingLDA(roll_proto))
  expect_true(is.RollingLDA(roll_lda2))
  expect_true(is.RollingLDA(roll_proto2))
  expect_true(is.RollingLDA(init_proto))
  expect_true(is.RollingLDA(init_proto_updated1))
  expect_true(is.RollingLDA(init_proto_updated2))
  expect_true(is.RollingLDA(init_proto_updated3))

  # no docs left after preproc
  expect_error(RollingLDA(economy_texts, economy_dates, "quarter", "6 month",
                          init = 20, K = 5, doc.abs = 600, type = "lda"),
               "after preprocessing there are no texts left for the initial model")
  expect_error(RollingLDA(economy_texts, economy_dates, "quarter", "6 month",
                          init = 20, K = 5, doc.abs = 60, type = "lda"),
               "after preprocessing there is only one text left for the initial model")
  expect_error(RollingLDA(economy_texts, economy_dates, "quarter", "6 month",
                      init = 20, K = 5, vocab.abs = 40, type = "lda"),
               "Assertion on 'vocab' failed")
  expect_error(RollingLDA(economy_texts, economy_dates, "quarter", "6 month",
                          init = 20, K = 5, vocab.abs = 41, type = "lda"),
               "after preprocessing there are no texts left for the initial model")
})

test_that("is.LDA", {
  expect_true(is.LDA(getLDA(roll_lda)))
  expect_true(is.LDA(getLDA(roll_lda), verbose = TRUE))
  expect_true(is.LDA(getLDA(roll_proto)))
  expect_true(is.LDA(getLDA(roll_lda2)))
  expect_true(is.LDA(getLDA(roll_proto2)))
  expect_true(is.LDA(getLDA(init_proto)))
  expect_true(is.LDA(getLDA(init_proto_updated1)))
  expect_true(is.LDA(getLDA(init_proto_updated2)))
  expect_true(is.LDA(getLDA(init_proto_updated3)))
})

test_that("chunks: match statistics with expactations", {
  expect_equal(getChunks(roll_lda),
               data.table(
                 chunk.id = 0:8L,
                 start.date = as.Date(c("2007-01-01",
                                        "2007-01-21",
                                        "2007-04-20",
                                        "2007-07-20",
                                        "2007-10-20",
                                        "2008-01-21",
                                        "2008-04-23",
                                        "2008-07-21",
                                        "2008-10-20")),
                 end.date = as.Date(c("2007-01-19",
                                      "2007-04-18",
                                      "2007-07-19",
                                      "2007-10-17",
                                      "2008-01-19",
                                      "2008-04-17",
                                      "2008-07-17",
                                      "2008-10-18",
                                      "2008-12-29")),
                 memory = as.Date(c(NA,
                                    "2006-07-20",
                                    "2006-10-20",
                                    "2007-01-20",
                                    "2007-04-20",
                                    "2007-07-20",
                                    "2007-10-20",
                                    "2008-01-20",
                                    "2008-04-20")),
                 n = as.integer(c(21, 84, 92, 85, 65, 67, 65, 60, 33)),
                 n.discarded = as.integer(c(0, 0, 0, 0, 2, 1, 1, 0, 0)),
                 n.memory = as.integer(c(NA, 21, 105, 176, 177, 150, 132, 132, 125)),
                 n.vocab = as.integer(c(55, 547, 841, 1038, 1160, 1261, 1323, 1486, 1503)),
                 key = colnames(getChunks(roll_lda))))
  expect_equal(getChunks(roll_lda)[, 1:2],
               data.table(
                 chunk.id = 0:8L,
                 start.date = as.Date(c("2007-01-01",
                                        "2007-01-21",
                                        "2007-04-20",
                                        "2007-07-20",
                                        "2007-10-20",
                                        "2008-01-21",
                                        "2008-04-23",
                                        "2008-07-21",
                                        "2008-10-20")), key = c("chunk.id", "start.date")))
})

test_that("types: lda, ldaprototype", {
  # roll_proto and roll_lda should only differ in lda (assignments ...) and id
  expect_identical(
    roll_proto[!names(roll_proto) %in% c("id", "lda")],
    roll_lda[!names(roll_lda) %in% c("id", "lda")])

  # roll_proto2 and roll_lda2 should only differ in lda (assignments ...)
  expect_identical(
    roll_proto2[names(roll_proto2) != "lda"],
    roll_lda2[names(roll_lda2) != "lda"])
})

test_that("unnamed dates", {
  seed = Sys.time()
  set.seed(seed)
  model1 = RollingLDA(economy_texts[1:200], unname(economy_dates[1:200]), "quarter", "6 month",
                      init = 20, K = 5, type = "lda")
  set.seed(seed)
  model2 = RollingLDA(economy_texts[1:200], economy_dates[1:200], "quarter", "6 month",
                      init = 20, K = 5, type = "lda")
  expect_identical(model1, model2)

  # keep in mind: dates must not be sorted, but are considered to be in the same
  # order as texts
  set.seed(seed)
  model2 = RollingLDA(economy_texts[1:200], sort(unname(economy_dates[1:200])), "quarter", "6 month",
                      init = 20, K = 5, type = "lda")
  expect_error(expect_identical(model1, model2))
})

test_that("init: date, ineger, character(date)", {
  chunk = unname(sort(economy_dates)[sample(180:220, 1)])
  init = unname(sort(economy_dates)[sample(80:120, 1)])
  mid = unname(sort(economy_dates)[sample(145:155, 1)])

  seed = Sys.time()

  # chunks as character identical to Date
  set.seed(seed)
  model1 = RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
                      chunks = init, memory = 20, K = 5, type = "lda")
  set.seed(seed)
  model2 = RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
                      chunks = as.character(init), memory = 20, K = 5, type = "lda")
  expect_identical(model1, model2)


  set.seed(seed)
  model1 = RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
                      chunks = c(init, mid), memory = 20, K = 5, type = "lda", vocab.abs = 0)
  set.seed(seed)
  model2 = RollingLDA(economy_texts[economy_dates < mid], economy_dates[economy_dates < mid],
                      chunks = init, memory = 20, K = 5, type = "lda", vocab.abs = 0)
  model2 = RollingLDA(model2, economy_texts[economy_dates >= mid & economy_dates <= chunk],
                      economy_dates[economy_dates >= mid & economy_dates <= chunk], memory = 20)
  expect_identical(getChunks(model1), getChunks(model2))
  expect_setequal(getVocab(model1), getVocab(model2))
  expect_mapequal(getDates(model1), getDates(model2))

  # needed if chunks character
  expect_error(RollingLDA(economy_texts[economy_dates < mid], economy_dates[economy_dates < mid],
                          chunks = "month", memory = 20, K = 5, type = "lda", vocab.abs = 0),
               "\"init\"")

  # not used if chunks dates
  set.seed(seed)
  model1 = RollingLDA(economy_texts[economy_dates < mid], economy_dates[economy_dates < mid],
                      chunks = init, memory = 20, K = 5, type = "lda", init = -12)
  set.seed(seed)
  model2 = RollingLDA(economy_texts[economy_dates < mid], economy_dates[economy_dates < mid],
                      chunks = init, memory = 20, K = 5, type = "lda", init = 0)
  set.seed(seed)
  model3 = RollingLDA(economy_texts[economy_dates < mid], economy_dates[economy_dates < mid],
                      chunks = init, memory = 20, K = 5, type = "lda", init = "999")
  expect_identical(model1, model2)
  expect_identical(model1, model3)

  # error: init smaller than minimum of dates
  expect_error(RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
                          chunks = min(economy_dates), memory = min(economy_dates) - 1, K = 5, type = "lda"),
               "lowest date")
})

test_that("memory: character, date, integer, character(date)", {
  chunk = unname(sort(economy_dates)[sample(180:220, 1)])
  init = unname(sort(economy_dates)[sample(80:120, 1)])
  mid = unname(sort(economy_dates)[sample(145:155, 1)])
  seed = Sys.time()

  # month = 1 month
  set.seed(seed)
  model1 = RollingLDA(economy_texts[economy_dates < mid], economy_dates[economy_dates < mid],
                      chunks = init, memory = "month", K = 5, type = "lda")
  set.seed(seed)
  model2 = RollingLDA(economy_texts[economy_dates < mid], economy_dates[economy_dates < mid],
                      chunks = init, memory = "1 month", K = 5, type = "lda")
  set.seed(seed)
  # tolower works
  model3 = RollingLDA(economy_texts[economy_dates < mid], economy_dates[economy_dates < mid],
                      chunks = init, memory = "1 mOnTh", K = 5, type = "lda")
  set.seed(seed)
  # date(s) = character dates
  memory1 = seq.Date(from = init, by = "-1 month", length.out = 2)[2]
  model4 = RollingLDA(economy_texts[economy_dates < mid], economy_dates[economy_dates < mid],
                      chunks = init, memory = memory1, K = 5, type = "lda")
  set.seed(seed)
  # character-date
  model5 = RollingLDA(economy_texts[economy_dates < mid], economy_dates[economy_dates < mid],
                      chunks = init, memory = as.character(memory1), K = 5, type = "lda")
  expect_identical(model1, model2)
  expect_identical(model1, model3)
  expect_identical(model1, model4)
  expect_identical(model1, model5)
  seed = Sys.time()
  set.seed(seed)
  model1 = updateRollingLDA(model1,
                            economy_texts[economy_dates >= mid & economy_dates <= chunk],
                            economy_dates[economy_dates >= mid & economy_dates <= chunk],
                            memory = "month")
  set.seed(seed)
  model2 = updateRollingLDA(model2,
                            economy_texts[economy_dates >= mid & economy_dates <= chunk],
                            economy_dates[economy_dates >= mid & economy_dates <= chunk],
                            memory = "1 month")
  expect_identical(model1, model2)

  # memory unsorted
  expect_error(RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
                          chunks = c(init, mid), memory = c(memory1, memory1-1), K = 5, type = "lda"),
               "\"memory\" must be sorted")

  # memory > chunks
  expect_error(RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
                          chunks = c(init, mid), memory = c(memory1, mid+1), K = 5, type = "lda"),
               "\"memory\" must not be greater than the pendant in \"chunks\"")

  # memory rep is ok
  model1 = RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
                      chunks = c(init, mid), memory = c(memory1, memory1), K = 5, type = "lda")

  # memory empty -> warning
  expect_warning(RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
                            chunks = c(init, mid), memory = c(memory1, mid), K = 5, type = "lda"),
                 "there are no texts as memory for this chunk - skip chunk")

  # memory.fallback -> warning
  expect_warning(RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
                            chunks = c(init, mid), memory = c(memory1, mid), K = 5, type = "lda", memory.fallback = 5),
                 "there are no texts as memory for this chunk - using \"memory.fallback\"")

  # 1month -> error
  expect_error(RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
                          chunks = init, memory = "1month", K = 5, type = "lda"))
  # memory vector
  expect_error(RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
                          chunks = c(init, mid), memory = c(memory1, "12"), K = 5, type = "lda"))

  ## updateRollingLDA ##
  # not RollingLDA object
  expect_error(updateRollingLDA(economy_texts[economy_dates >= mid & economy_dates <= chunk],
                                economy_dates[economy_dates >= mid & economy_dates <= chunk],
                                chunks = range(economy_dates[economy_dates >= mid & economy_dates <= chunk]),
                                memory = 4), "not a RollingLDA object")

  # memory missing
  expect_error(updateRollingLDA(model3,
                                economy_texts[economy_dates >= mid & economy_dates <= chunk],
                                economy_dates[economy_dates >= mid & economy_dates <= chunk],
                                chunks = range(economy_dates[economy_dates >= mid & economy_dates <= chunk])))

  # should be fine:
  seed = Sys.time()
  set.seed(seed)
  model4 = updateRollingLDA(model3,
                            economy_texts[economy_dates >= mid & economy_dates <= chunk],
                            economy_dates[economy_dates >= mid & economy_dates <= chunk],
                            chunks = range(economy_dates[economy_dates >= mid & economy_dates <= chunk]),
                            memory = 4)
  set.seed(seed)
  model5 = updateRollingLDA(model3,
                            economy_texts[economy_dates >= mid & economy_dates <= chunk],
                            unname(economy_dates[economy_dates >= mid & economy_dates <= chunk]),
                            chunks = range(economy_dates[economy_dates >= mid & economy_dates <= chunk]),
                            memory = 4)
  expect_identical(model4, model5)
  # should be fine:
  model4 = updateRollingLDA(model3,
                            economy_texts[economy_dates >= mid & economy_dates <= chunk],
                            economy_dates[economy_dates >= mid & economy_dates <= chunk],
                            chunks = as.character(range(economy_dates[economy_dates >= mid & economy_dates <= chunk])),
                            memory = range(economy_dates[economy_dates >= mid & economy_dates <= chunk])-10)
  # memory unsorted
  expect_error(updateRollingLDA(model3,
                                economy_texts[economy_dates >= mid & economy_dates <= chunk],
                                economy_dates[economy_dates >= mid & economy_dates <= chunk],
                                chunks = range(economy_dates[economy_dates >= mid & economy_dates <= chunk]),
                                memory = c(min(economy_dates[economy_dates >= mid & economy_dates <= chunk]),
                                           min(economy_dates[economy_dates >= mid & economy_dates <= chunk])-1)),
               "\"memory\" must be sorted")
  # memory > chunks
  expect_error(updateRollingLDA(model3,
                                economy_texts[economy_dates >= mid & economy_dates <= chunk],
                                economy_dates[economy_dates >= mid & economy_dates <= chunk],
                                chunks = range(economy_dates[economy_dates >= mid & economy_dates <= chunk]),
                                memory = range(economy_dates[economy_dates >= mid & economy_dates <= chunk])+1),
               "\"memory\" must not be greater than the pendant in \"chunks\"")
  # chunks unsorted
  expect_error(updateRollingLDA(model3,
                                economy_texts[economy_dates >= mid & economy_dates <= chunk],
                                economy_dates[economy_dates >= mid & economy_dates <= chunk],
                                chunks = rev(range(economy_dates[economy_dates >= mid & economy_dates <= chunk])),
                                memory = 4), "\"chunks\" must be sorted")
})

test_that("chunks: character, date, character(date)", {
  # TODO

  chunk = unname(sort(economy_dates)[sample(180:220, 1)])
  init = unname(sort(economy_dates)[sample(80:120, 1)])
  memory1 = seq.Date(from = init, by = "-1 month", length.out = 2)[2]
  seed = Sys.time()

  # chunks as character identical to Date
  set.seed(seed)
  model1 = RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
                      chunks = init, memory = 20, K = 5, type = "lda")
  set.seed(seed)
  model2 = RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
                      chunks = as.character(init), memory = 20, K = 5, type = "lda")
  expect_identical(model1, model2)

  # date(s)

  # character chunks = date(s)
  chunks = seq.Date(init+1, max(economy_dates[economy_dates <= chunk]), "quarter")

  set.seed(seed)
  model1 = RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
                      chunks = chunks, memory = 10, init = init, K = 5, type = "lda")
  set.seed(seed)
  model2 = RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
                      chunks = "quarter", memory = 10, init = init, K = 5, type = "lda")
  expect_identical(model1, model2)

  # chunks unsorted
  expect_error(RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
                          chunks = c(init, init-1), memory = c(memory1, memory1), K = 5, type = "lda"),
               "\"chunks\" must be sorted")
  # chunks rep -> empty chunk -> warning
  expect_warning(RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
                            chunks = c(init, init), memory = c(memory1, memory1), K = 5, type = "lda"),
                 "there are no texts in this chunk - skip chunk")
})


test_that("staged setting: updateRollingLDA", {
  # init_proto_updated1 and 2 are identical, but 3 is different (different seed)!
  expect_identical(init_proto_updated1, init_proto_updated2)
  expect_error(expect_identical(init_proto_updated1, init_proto_updated3))
  # roll_proto2 and init_proto_updated1 should only differ in lda and
  # first element of n.discarded in chunks table
  expect_identical(
    roll_proto2[!names(roll_proto2) %in% c("lda", "chunks")],
    init_proto_updated1[!names(init_proto_updated1) %in% c("lda", "chunks")])
  # initialized RollingLDA with unknown discarded texts in chunk 0
  expect_equal(
    getChunks(init_proto_updated1)$n.discarded[1],
    NA_integer_)
  # rest of chunks should be equal
  expect_identical(
    getChunks(roll_proto2)[-1,],
    getChunks(init_proto_updated1)[-1,])
})

test_that("Parameter RollingLDA match", {
  expect_identical(getParam(roll_proto),
                   getParam(roll_lda))
  expect_identical(getParam(roll_proto2),
                   getParam(roll_lda))
  expect_identical(getParam(roll_lda2),
                   getParam(roll_lda))
  expect_identical(getParam(init_proto_updated1),
                   getParam(roll_lda))
  expect_identical(rollinglda:::.defaultParam(),
                   getParam(roll_lda))

})

test_that("Parameter LDA match", {
  expect_identical(getParam(getLDA(roll_proto)),
                   getParam(getLDA(roll_lda)))
  expect_equal(getK(getLDA(roll_proto)), 5)
  expect_equal(getAlpha(getLDA(roll_proto)), 1/5)
  expect_equal(getEta(getLDA(roll_proto)), 1/5)
  expect_equal(getNum.iterations(getLDA(roll_proto)), 200)

  expect_equal(getK(getLDA(roll_proto2)), 10)
  expect_equal(getAlpha(getLDA(roll_proto2)), 1)
  expect_equal(getEta(getLDA(roll_proto2)), 1)
  expect_equal(getNum.iterations(getLDA(roll_proto2)), 20)
  expect_identical(getParam(getLDA(roll_proto2)),
                   getParam(getLDA(roll_lda2)))
  expect_identical(getParam(getLDA(roll_proto2)),
                   getParam(getLDA(roll_lda2)))
})

test_that("print.RollingLDA", {
  # prints not equal
  expect_error(expect_output(print(roll_lda), capture.output(print(roll_lda2))))
  # RollingLDA Object
  expect_output(print(roll_lda), "RollingLDA Object")
  expect_output(print(roll_lda2), "RollingLDA Object")

  # LDA print is part of RollingLDA print
  for(i in capture.output(print(getLDA(roll_lda)))[1:5]){
    expect_output(print(roll_lda), i, fixed = TRUE)
  }
  for(i in capture.output(print(getLDA(roll_lda2)))[1:5]){
    expect_output(print(roll_lda2), i, fixed = TRUE)
  }
})

Try the rollinglda package in your browser

Any scripts or data that you put into this service are public.

rollinglda documentation built on Oct. 1, 2022, 9:06 a.m.