test_RollingLDA.R
In rollinglda: Construct Consistent Time Series from Textual Data

context("RollingLDA")

data("economy_texts")
data("economy_dates")

roll_proto = RollingLDA(economy_texts, economy_dates, "quarter", "6 month",
                        init = 20, K = 5, n = 10)
roll_lda = RollingLDA(economy_texts, economy_dates, "quarter", "6 month",
                      init = 20, K = 5, type = "lda")

roll_proto2 = RollingLDA(economy_texts, economy_dates, "quarter", "6 month",
                         init = 100, K = 10, alpha = 1, eta = 1, num.iterations = 20,
                         n = 10, id = "myid", seeds = 1:10)
roll_lda2 = RollingLDA(economy_texts, economy_dates, "quarter", "6 month",
                       init = 100, K = 10, alpha = 1, eta = 1, num.iterations = 20,
                       id = "myid", type = "lda")

docs = getDocs(roll_proto2, names = getNames(roll_proto2)[1:getChunks(roll_proto2)$n[1]])
vocab = getVocab(roll_proto2)[1:getChunks(roll_proto2)$n.vocab[1]]
dates = getDates(roll_proto2)[1:getChunks(roll_proto2)$n[1]]
proto2 = LDAPrototype(docs, vocab, n = 10, seeds = 1:10,
                      K = 10, alpha = 1, eta = 1, num.iterations = 20)

init_proto = as.RollingLDA(lda = getLDA(proto2), dates = dates, docs = docs, id = "myid")
seed = Sys.time()
set.seed(seed)
init_proto_updated1 = RollingLDA(init_proto,
                                 economy_texts[!names(economy_texts) %in% names(docs)],
                                 economy_dates[!names(economy_dates) %in% names(docs)],
                                 "quarter", "6 month")
set.seed(seed)
init_proto_updated2 = updateRollingLDA(init_proto,
                                       economy_texts[!names(economy_texts) %in% names(docs)],
                                       economy_dates[!names(economy_dates) %in% names(docs)],
                                       "quarter", "6 month")
init_proto_updated3 = updateRollingLDA(init_proto,
                                       economy_texts[!names(economy_texts) %in% names(docs)],
                                       economy_dates[!names(economy_dates) %in% names(docs)],
                                       "quarter", "6 month")

test_that("is.RollingLDA", {
  expect_true(is.RollingLDA(roll_lda))
  expect_true(is.RollingLDA(roll_lda, verbose = TRUE))
  expect_message(is.RollingLDA(roll_lda, verbose = TRUE), "id")
  expect_message(is.RollingLDA(roll_lda, verbose = TRUE), "lda")
  expect_message(is.RollingLDA(roll_lda, verbose = TRUE), "docs")
  expect_message(is.RollingLDA(roll_lda, verbose = TRUE), "dates")
  expect_message(is.RollingLDA(roll_lda, verbose = TRUE), "vocab")
  expect_message(is.RollingLDA(roll_lda, verbose = TRUE), "chunks")
  expect_message(is.RollingLDA(roll_lda, verbose = TRUE), "param")
  tmp = roll_lda
  ind = sample(getChunks(tmp)$chunk.id, 1)
  tmp$chunks$n[ind+1] = tmp$chunks$n[ind+1] + 1L
  expect_false(is.RollingLDA(tmp))
  expect_false(is.RollingLDA(tmp, verbose = TRUE))
  expect_message(is.RollingLDA(tmp, verbose = TRUE),
                 "sum of \"n\" does not match number of texts")
  expect_true(is.RollingLDA(roll_proto))
  expect_true(is.RollingLDA(roll_lda2))
  expect_true(is.RollingLDA(roll_proto2))
  expect_true(is.RollingLDA(init_proto))
  expect_true(is.RollingLDA(init_proto_updated1))
  expect_true(is.RollingLDA(init_proto_updated2))
  expect_true(is.RollingLDA(init_proto_updated3))

  # no docs left after preproc
  expect_error(RollingLDA(economy_texts, economy_dates, "quarter", "6 month",
                          init = 20, K = 5, doc.abs = 600, type = "lda"),
               "after preprocessing there are no texts left for the initial model")
  expect_error(RollingLDA(economy_texts, economy_dates, "quarter", "6 month",
                          init = 20, K = 5, doc.abs = 60, type = "lda"),
               "after preprocessing there is only one text left for the initial model")
  expect_error(RollingLDA(economy_texts, economy_dates, "quarter", "6 month",
                      init = 20, K = 5, vocab.abs = 40, type = "lda"),
               "Assertion on 'vocab' failed")
  expect_error(RollingLDA(economy_texts, economy_dates, "quarter", "6 month",
                          init = 20, K = 5, vocab.abs = 41, type = "lda"),
               "after preprocessing there are no texts left for the initial model")
})

test_that("is.LDA", {
  expect_true(is.LDA(getLDA(roll_lda)))
  expect_true(is.LDA(getLDA(roll_lda), verbose = TRUE))
  expect_true(is.LDA(getLDA(roll_proto)))
  expect_true(is.LDA(getLDA(roll_lda2)))
  expect_true(is.LDA(getLDA(roll_proto2)))
  expect_true(is.LDA(getLDA(init_proto)))
  expect_true(is.LDA(getLDA(init_proto_updated1)))
  expect_true(is.LDA(getLDA(init_proto_updated2)))
  expect_true(is.LDA(getLDA(init_proto_updated3)))
})

test_that("chunks: match statistics with expactations", {
  expect_equal(getChunks(roll_lda),
               data.table(
                 chunk.id = 0:8L,
                 start.date = as.Date(c("2007-01-01",
                                        "2007-01-21",
                                        "2007-04-20",
                                        "2007-07-20",
                                        "2007-10-20",
                                        "2008-01-21",
                                        "2008-04-23",
                                        "2008-07-21",
                                        "2008-10-20")),
                 end.date = as.Date(c("2007-01-19",
                                      "2007-04-18",
                                      "2007-07-19",
                                      "2007-10-17",
                                      "2008-01-19",
                                      "2008-04-17",
                                      "2008-07-17",
                                      "2008-10-18",
                                      "2008-12-29")),
                 memory = as.Date(c(NA,
                                    "2006-07-20",
                                    "2006-10-20",
                                    "2007-01-20",
                                    "2007-04-20",
                                    "2007-07-20",
                                    "2007-10-20",
                                    "2008-01-20",
                                    "2008-04-20")),
                 n = as.integer(c(21, 84, 92, 85, 65, 67, 65, 60, 33)),
                 n.discarded = as.integer(c(0, 0, 0, 0, 2, 1, 1, 0, 0)),
                 n.memory = as.integer(c(NA, 21, 105, 176, 177, 150, 132, 132, 125)),
                 n.vocab = as.integer(c(55, 547, 841, 1038, 1160, 1261, 1323, 1486, 1503)),
                 key = colnames(getChunks(roll_lda))))
  expect_equal(getChunks(roll_lda)[, 1:2],
               data.table(
                 chunk.id = 0:8L,
                 start.date = as.Date(c("2007-01-01",
                                        "2007-01-21",
                                        "2007-04-20",
                                        "2007-07-20",
                                        "2007-10-20",
                                        "2008-01-21",
                                        "2008-04-23",
                                        "2008-07-21",
                                        "2008-10-20")), key = c("chunk.id", "start.date")))
})

test_that("types: lda, ldaprototype", {
  # roll_proto and roll_lda should only differ in lda (assignments ...) and id
  expect_identical(
    roll_proto[!names(roll_proto) %in% c("id", "lda")],
    roll_lda[!names(roll_lda) %in% c("id", "lda")])

  # roll_proto2 and roll_lda2 should only differ in lda (assignments ...)
  expect_identical(
    roll_proto2[names(roll_proto2) != "lda"],
    roll_lda2[names(roll_lda2) != "lda"])
})

test_that("unnamed dates", {
  seed = Sys.time()
  set.seed(seed)
  model1 = RollingLDA(economy_texts[1:200], unname(economy_dates[1:200]), "quarter", "6 month",
                      init = 20, K = 5, type = "lda")
  set.seed(seed)
  model2 = RollingLDA(economy_texts[1:200], economy_dates[1:200], "quarter", "6 month",
                      init = 20, K = 5, type = "lda")
  expect_identical(model1, model2)

  # keep in mind: dates must not be sorted, but are considered to be in the same
  # order as texts
  set.seed(seed)
  model2 = RollingLDA(economy_texts[1:200], sort(unname(economy_dates[1:200])), "quarter", "6 month",
                      init = 20, K = 5, type = "lda")
  expect_error(expect_identical(model1, model2))
})

test_that("init: date, ineger, character(date)", {
  chunk = unname(sort(economy_dates)[sample(180:220, 1)])
  init = unname(sort(economy_dates)[sample(80:120, 1)])
  mid = unname(sort(economy_dates)[sample(145:155, 1)])

  seed = Sys.time()

  # chunks as character identical to Date
  set.seed(seed)
  model1 = RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
                      chunks = init, memory = 20, K = 5, type = "lda")
  set.seed(seed)
  model2 = RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
                      chunks = as.character(init), memory = 20, K = 5, type = "lda")
  expect_identical(model1, model2)


  set.seed(seed)
  model1 = RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
                      chunks = c(init, mid), memory = 20, K = 5, type = "lda", vocab.abs = 0)
  set.seed(seed)
  model2 = RollingLDA(economy_texts[economy_dates < mid], economy_dates[economy_dates < mid],
                      chunks = init, memory = 20, K = 5, type = "lda", vocab.abs = 0)
  model2 = RollingLDA(model2, economy_texts[economy_dates >= mid & economy_dates <= chunk],
                      economy_dates[economy_dates >= mid & economy_dates <= chunk], memory = 20)
  expect_identical(getChunks(model1), getChunks(model2))
  expect_setequal(getVocab(model1), getVocab(model2))
  expect_mapequal(getDates(model1), getDates(model2))

  # needed if chunks character
  expect_error(RollingLDA(economy_texts[economy_dates < mid], economy_dates[economy_dates < mid],
                          chunks = "month", memory = 20, K = 5, type = "lda", vocab.abs = 0),
               "\"init\"")

  # not used if chunks dates
  set.seed(seed)
  model1 = RollingLDA(economy_texts[economy_dates < mid], economy_dates[economy_dates < mid],
                      chunks = init, memory = 20, K = 5, type = "lda", init = -12)
  set.seed(seed)
  model2 = RollingLDA(economy_texts[economy_dates < mid], economy_dates[economy_dates < mid],
                      chunks = init, memory = 20, K = 5, type = "lda", init = 0)
  set.seed(seed)
  model3 = RollingLDA(economy_texts[economy_dates < mid], economy_dates[economy_dates < mid],
                      chunks = init, memory = 20, K = 5, type = "lda", init = "999")
  expect_identical(model1, model2)
  expect_identical(model1, model3)

  # error: init smaller than minimum of dates
  expect_error(RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
                          chunks = min(economy_dates), memory = min(economy_dates) - 1, K = 5, type = "lda"),
               "lowest date")
})

test_that("memory: character, date, integer, character(date)", {
  chunk = unname(sort(economy_dates)[sample(180:220, 1)])
  init = unname(sort(economy_dates)[sample(80:120, 1)])
  mid = unname(sort(economy_dates)[sample(145:155, 1)])
  seed = Sys.time()

  # month = 1 month
  set.seed(seed)
  model1 = RollingLDA(economy_texts[economy_dates < mid], economy_dates[economy_dates < mid],
                      chunks = init, memory = "month", K = 5, type = "lda")
  set.seed(seed)
  model2 = RollingLDA(economy_texts[economy_dates < mid], economy_dates[economy_dates < mid],
                      chunks = init, memory = "1 month", K = 5, type = "lda")
  set.seed(seed)
  # tolower works
  model3 = RollingLDA(economy_texts[economy_dates < mid], economy_dates[economy_dates < mid],
                      chunks = init, memory = "1 mOnTh", K = 5, type = "lda")
  set.seed(seed)
  # date(s) = character dates
  memory1 = seq.Date(from = init, by = "-1 month", length.out = 2)[2]
  model4 = RollingLDA(economy_texts[economy_dates < mid], economy_dates[economy_dates < mid],
                      chunks = init, memory = memory1, K = 5, type = "lda")
  set.seed(seed)
  # character-date
  model5 = RollingLDA(economy_texts[economy_dates < mid], economy_dates[economy_dates < mid],
                      chunks = init, memory = as.character(memory1), K = 5, type = "lda")
  expect_identical(model1, model2)
  expect_identical(model1, model3)
  expect_identical(model1, model4)
  expect_identical(model1, model5)
  seed = Sys.time()
  set.seed(seed)
  model1 = updateRollingLDA(model1,
                            economy_texts[economy_dates >= mid & economy_dates <= chunk],
                            economy_dates[economy_dates >= mid & economy_dates <= chunk],
                            memory = "month")
  set.seed(seed)
  model2 = updateRollingLDA(model2,
                            economy_texts[economy_dates >= mid & economy_dates <= chunk],
                            economy_dates[economy_dates >= mid & economy_dates <= chunk],
                            memory = "1 month")
  expect_identical(model1, model2)

  # memory unsorted
  expect_error(RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
                          chunks = c(init, mid), memory = c(memory1, memory1-1), K = 5, type = "lda"),
               "\"memory\" must be sorted")

  # memory > chunks
  expect_error(RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
                          chunks = c(init, mid), memory = c(memory1, mid+1), K = 5, type = "lda"),
               "\"memory\" must not be greater than the pendant in \"chunks\"")

  # memory rep is ok
  model1 = RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
                      chunks = c(init, mid), memory = c(memory1, memory1), K = 5, type = "lda")

  # memory empty -> warning
  expect_warning(RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
                            chunks = c(init, mid), memory = c(memory1, mid), K = 5, type = "lda"),
                 "there are no texts as memory for this chunk - skip chunk")

  # memory.fallback -> warning
  expect_warning(RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
                            chunks = c(init, mid), memory = c(memory1, mid), K = 5, type = "lda", memory.fallback = 5),
                 "there are no texts as memory for this chunk - using \"memory.fallback\"")

  # 1month -> error
  expect_error(RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
                          chunks = init, memory = "1month", K = 5, type = "lda"))
  # memory vector
  expect_error(RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
                          chunks = c(init, mid), memory = c(memory1, "12"), K = 5, type = "lda"))

  ## updateRollingLDA ##
  # not RollingLDA object
  expect_error(updateRollingLDA(economy_texts[economy_dates >= mid & economy_dates <= chunk],
                                economy_dates[economy_dates >= mid & economy_dates <= chunk],
                                chunks = range(economy_dates[economy_dates >= mid & economy_dates <= chunk]),
                                memory = 4), "not a RollingLDA object")

  # memory missing
  expect_error(updateRollingLDA(model3,
                                economy_texts[economy_dates >= mid & economy_dates <= chunk],
                                economy_dates[economy_dates >= mid & economy_dates <= chunk],
                                chunks = range(economy_dates[economy_dates >= mid & economy_dates <= chunk])))

  # should be fine:
  seed = Sys.time()
  set.seed(seed)
  model4 = updateRollingLDA(model3,
                            economy_texts[economy_dates >= mid & economy_dates <= chunk],
                            economy_dates[economy_dates >= mid & economy_dates <= chunk],
                            chunks = range(economy_dates[economy_dates >= mid & economy_dates <= chunk]),
                            memory = 4)
  set.seed(seed)
  model5 = updateRollingLDA(model3,
                            economy_texts[economy_dates >= mid & economy_dates <= chunk],
                            unname(economy_dates[economy_dates >= mid & economy_dates <= chunk]),
                            chunks = range(economy_dates[economy_dates >= mid & economy_dates <= chunk]),
                            memory = 4)
  expect_identical(model4, model5)
  # should be fine:
  model4 = updateRollingLDA(model3,
                            economy_texts[economy_dates >= mid & economy_dates <= chunk],
                            economy_dates[economy_dates >= mid & economy_dates <= chunk],
                            chunks = as.character(range(economy_dates[economy_dates >= mid & economy_dates <= chunk])),
                            memory = range(economy_dates[economy_dates >= mid & economy_dates <= chunk])-10)
  # memory unsorted
  expect_error(updateRollingLDA(model3,
                                economy_texts[economy_dates >= mid & economy_dates <= chunk],
                                economy_dates[economy_dates >= mid & economy_dates <= chunk],
                                chunks = range(economy_dates[economy_dates >= mid & economy_dates <= chunk]),
                                memory = c(min(economy_dates[economy_dates >= mid & economy_dates <= chunk]),
                                           min(economy_dates[economy_dates >= mid & economy_dates <= chunk])-1)),
               "\"memory\" must be sorted")
  # memory > chunks
  expect_error(updateRollingLDA(model3,
                                economy_texts[economy_dates >= mid & economy_dates <= chunk],
                                economy_dates[economy_dates >= mid & economy_dates <= chunk],
                                chunks = range(economy_dates[economy_dates >= mid & economy_dates <= chunk]),
                                memory = range(economy_dates[economy_dates >= mid & economy_dates <= chunk])+1),
               "\"memory\" must not be greater than the pendant in \"chunks\"")
  # chunks unsorted
  expect_error(updateRollingLDA(model3,
                                economy_texts[economy_dates >= mid & economy_dates <= chunk],
                                economy_dates[economy_dates >= mid & economy_dates <= chunk],
                                chunks = rev(range(economy_dates[economy_dates >= mid & economy_dates <= chunk])),
                                memory = 4), "\"chunks\" must be sorted")
})

test_that("chunks: character, date, character(date)", {
  # TODO

  chunk = unname(sort(economy_dates)[sample(180:220, 1)])
  init = unname(sort(economy_dates)[sample(80:120, 1)])
  memory1 = seq.Date(from = init, by = "-1 month", length.out = 2)[2]
  seed = Sys.time()

  # chunks as character identical to Date
  set.seed(seed)
  model1 = RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
                      chunks = init, memory = 20, K = 5, type = "lda")
  set.seed(seed)
  model2 = RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
                      chunks = as.character(init), memory = 20, K = 5, type = "lda")
  expect_identical(model1, model2)

  # date(s)

  # character chunks = date(s)
  chunks = seq.Date(init+1, max(economy_dates[economy_dates <= chunk]), "quarter")

  set.seed(seed)
  model1 = RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
                      chunks = chunks, memory = 10, init = init, K = 5, type = "lda")
  set.seed(seed)
  model2 = RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
                      chunks = "quarter", memory = 10, init = init, K = 5, type = "lda")
  expect_identical(model1, model2)

  # chunks unsorted
  expect_error(RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
                          chunks = c(init, init-1), memory = c(memory1, memory1), K = 5, type = "lda"),
               "\"chunks\" must be sorted")
  # chunks rep -> empty chunk -> warning
  expect_warning(RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
                            chunks = c(init, init), memory = c(memory1, memory1), K = 5, type = "lda"),
                 "there are no texts in this chunk - skip chunk")
})


test_that("staged setting: updateRollingLDA", {
  # init_proto_updated1 and 2 are identical, but 3 is different (different seed)!
  expect_identical(init_proto_updated1, init_proto_updated2)
  expect_error(expect_identical(init_proto_updated1, init_proto_updated3))
  # roll_proto2 and init_proto_updated1 should only differ in lda and
  # first element of n.discarded in chunks table
  expect_identical(
    roll_proto2[!names(roll_proto2) %in% c("lda", "chunks")],
    init_proto_updated1[!names(init_proto_updated1) %in% c("lda", "chunks")])
  # initialized RollingLDA with unknown discarded texts in chunk 0
  expect_equal(
    getChunks(init_proto_updated1)$n.discarded[1],
    NA_integer_)
  # rest of chunks should be equal
  expect_identical(
    getChunks(roll_proto2)[-1,],
    getChunks(init_proto_updated1)[-1,])
})

test_that("Parameter RollingLDA match", {
  expect_identical(getParam(roll_proto),
                   getParam(roll_lda))
  expect_identical(getParam(roll_proto2),
                   getParam(roll_lda))
  expect_identical(getParam(roll_lda2),
                   getParam(roll_lda))
  expect_identical(getParam(init_proto_updated1),
                   getParam(roll_lda))
  expect_identical(rollinglda:::.defaultParam(),
                   getParam(roll_lda))

})

test_that("Parameter LDA match", {
  expect_identical(getParam(getLDA(roll_proto)),
                   getParam(getLDA(roll_lda)))
  expect_equal(getK(getLDA(roll_proto)), 5)
  expect_equal(getAlpha(getLDA(roll_proto)), 1/5)
  expect_equal(getEta(getLDA(roll_proto)), 1/5)
  expect_equal(getNum.iterations(getLDA(roll_proto)), 200)

  expect_equal(getK(getLDA(roll_proto2)), 10)
  expect_equal(getAlpha(getLDA(roll_proto2)), 1)
  expect_equal(getEta(getLDA(roll_proto2)), 1)
  expect_equal(getNum.iterations(getLDA(roll_proto2)), 20)
  expect_identical(getParam(getLDA(roll_proto2)),
                   getParam(getLDA(roll_lda2)))
  expect_identical(getParam(getLDA(roll_proto2)),
                   getParam(getLDA(roll_lda2)))
})

test_that("print.RollingLDA", {
  # prints not equal
  expect_error(expect_output(print(roll_lda), capture.output(print(roll_lda2))))
  # RollingLDA Object
  expect_output(print(roll_lda), "RollingLDA Object")
  expect_output(print(roll_lda2), "RollingLDA Object")

  # LDA print is part of RollingLDA print
  for(i in capture.output(print(getLDA(roll_lda)))[1:5]){
    expect_output(print(roll_lda), i, fixed = TRUE)
  }
  for(i in capture.output(print(getLDA(roll_lda2)))[1:5]){
    expect_output(print(roll_lda2), i, fixed = TRUE)
  }
})
Any scripts or data that you put into this service are public.
rollinglda documentation built on May 29, 2024, 3:32 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
rollinglda
Construct Consistent Time Series from Textual Data

tests/testthat/test_RollingLDA.R
In rollinglda: Construct Consistent Time Series from Textual Data

Try the rollinglda package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

rollinglda Construct Consistent Time Series from Textual Data

tests/testthat/test_RollingLDA.R In rollinglda: Construct Consistent Time Series from Textual Data

Try the rollinglda package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

rollinglda
Construct Consistent Time Series from Textual Data

tests/testthat/test_RollingLDA.R
In rollinglda: Construct Consistent Time Series from Textual Data