Nothing
context("RollingLDA")
data("economy_texts")
data("economy_dates")
roll_proto = RollingLDA(economy_texts, economy_dates, "quarter", "6 month",
init = 20, K = 5, n = 10)
roll_lda = RollingLDA(economy_texts, economy_dates, "quarter", "6 month",
init = 20, K = 5, type = "lda")
roll_proto2 = RollingLDA(economy_texts, economy_dates, "quarter", "6 month",
init = 100, K = 10, alpha = 1, eta = 1, num.iterations = 20,
n = 10, id = "myid", seeds = 1:10)
roll_lda2 = RollingLDA(economy_texts, economy_dates, "quarter", "6 month",
init = 100, K = 10, alpha = 1, eta = 1, num.iterations = 20,
id = "myid", type = "lda")
docs = getDocs(roll_proto2, names = getNames(roll_proto2)[1:getChunks(roll_proto2)$n[1]])
vocab = getVocab(roll_proto2)[1:getChunks(roll_proto2)$n.vocab[1]]
dates = getDates(roll_proto2)[1:getChunks(roll_proto2)$n[1]]
proto2 = LDAPrototype(docs, vocab, n = 10, seeds = 1:10,
K = 10, alpha = 1, eta = 1, num.iterations = 20)
init_proto = as.RollingLDA(lda = getLDA(proto2), dates = dates, docs = docs, id = "myid")
seed = Sys.time()
set.seed(seed)
init_proto_updated1 = RollingLDA(init_proto,
economy_texts[!names(economy_texts) %in% names(docs)],
economy_dates[!names(economy_dates) %in% names(docs)],
"quarter", "6 month")
set.seed(seed)
init_proto_updated2 = updateRollingLDA(init_proto,
economy_texts[!names(economy_texts) %in% names(docs)],
economy_dates[!names(economy_dates) %in% names(docs)],
"quarter", "6 month")
init_proto_updated3 = updateRollingLDA(init_proto,
economy_texts[!names(economy_texts) %in% names(docs)],
economy_dates[!names(economy_dates) %in% names(docs)],
"quarter", "6 month")
test_that("is.RollingLDA", {
expect_true(is.RollingLDA(roll_lda))
expect_true(is.RollingLDA(roll_lda, verbose = TRUE))
expect_message(is.RollingLDA(roll_lda, verbose = TRUE), "id")
expect_message(is.RollingLDA(roll_lda, verbose = TRUE), "lda")
expect_message(is.RollingLDA(roll_lda, verbose = TRUE), "docs")
expect_message(is.RollingLDA(roll_lda, verbose = TRUE), "dates")
expect_message(is.RollingLDA(roll_lda, verbose = TRUE), "vocab")
expect_message(is.RollingLDA(roll_lda, verbose = TRUE), "chunks")
expect_message(is.RollingLDA(roll_lda, verbose = TRUE), "param")
tmp = roll_lda
ind = sample(getChunks(tmp)$chunk.id, 1)
tmp$chunks$n[ind+1] = tmp$chunks$n[ind+1] + 1L
expect_false(is.RollingLDA(tmp))
expect_false(is.RollingLDA(tmp, verbose = TRUE))
expect_message(is.RollingLDA(tmp, verbose = TRUE),
"sum of \"n\" does not match number of texts")
expect_true(is.RollingLDA(roll_proto))
expect_true(is.RollingLDA(roll_lda2))
expect_true(is.RollingLDA(roll_proto2))
expect_true(is.RollingLDA(init_proto))
expect_true(is.RollingLDA(init_proto_updated1))
expect_true(is.RollingLDA(init_proto_updated2))
expect_true(is.RollingLDA(init_proto_updated3))
# no docs left after preproc
expect_error(RollingLDA(economy_texts, economy_dates, "quarter", "6 month",
init = 20, K = 5, doc.abs = 600, type = "lda"),
"after preprocessing there are no texts left for the initial model")
expect_error(RollingLDA(economy_texts, economy_dates, "quarter", "6 month",
init = 20, K = 5, doc.abs = 60, type = "lda"),
"after preprocessing there is only one text left for the initial model")
expect_error(RollingLDA(economy_texts, economy_dates, "quarter", "6 month",
init = 20, K = 5, vocab.abs = 40, type = "lda"),
"Assertion on 'vocab' failed")
expect_error(RollingLDA(economy_texts, economy_dates, "quarter", "6 month",
init = 20, K = 5, vocab.abs = 41, type = "lda"),
"after preprocessing there are no texts left for the initial model")
})
test_that("is.LDA", {
expect_true(is.LDA(getLDA(roll_lda)))
expect_true(is.LDA(getLDA(roll_lda), verbose = TRUE))
expect_true(is.LDA(getLDA(roll_proto)))
expect_true(is.LDA(getLDA(roll_lda2)))
expect_true(is.LDA(getLDA(roll_proto2)))
expect_true(is.LDA(getLDA(init_proto)))
expect_true(is.LDA(getLDA(init_proto_updated1)))
expect_true(is.LDA(getLDA(init_proto_updated2)))
expect_true(is.LDA(getLDA(init_proto_updated3)))
})
test_that("chunks: match statistics with expactations", {
expect_equal(getChunks(roll_lda),
data.table(
chunk.id = 0:8L,
start.date = as.Date(c("2007-01-01",
"2007-01-21",
"2007-04-20",
"2007-07-20",
"2007-10-20",
"2008-01-21",
"2008-04-23",
"2008-07-21",
"2008-10-20")),
end.date = as.Date(c("2007-01-19",
"2007-04-18",
"2007-07-19",
"2007-10-17",
"2008-01-19",
"2008-04-17",
"2008-07-17",
"2008-10-18",
"2008-12-29")),
memory = as.Date(c(NA,
"2006-07-20",
"2006-10-20",
"2007-01-20",
"2007-04-20",
"2007-07-20",
"2007-10-20",
"2008-01-20",
"2008-04-20")),
n = as.integer(c(21, 84, 92, 85, 65, 67, 65, 60, 33)),
n.discarded = as.integer(c(0, 0, 0, 0, 2, 1, 1, 0, 0)),
n.memory = as.integer(c(NA, 21, 105, 176, 177, 150, 132, 132, 125)),
n.vocab = as.integer(c(55, 547, 841, 1038, 1160, 1261, 1323, 1486, 1503)),
key = colnames(getChunks(roll_lda))))
expect_equal(getChunks(roll_lda)[, 1:2],
data.table(
chunk.id = 0:8L,
start.date = as.Date(c("2007-01-01",
"2007-01-21",
"2007-04-20",
"2007-07-20",
"2007-10-20",
"2008-01-21",
"2008-04-23",
"2008-07-21",
"2008-10-20")), key = c("chunk.id", "start.date")))
})
test_that("types: lda, ldaprototype", {
# roll_proto and roll_lda should only differ in lda (assignments ...) and id
expect_identical(
roll_proto[!names(roll_proto) %in% c("id", "lda")],
roll_lda[!names(roll_lda) %in% c("id", "lda")])
# roll_proto2 and roll_lda2 should only differ in lda (assignments ...)
expect_identical(
roll_proto2[names(roll_proto2) != "lda"],
roll_lda2[names(roll_lda2) != "lda"])
})
test_that("unnamed dates", {
seed = Sys.time()
set.seed(seed)
model1 = RollingLDA(economy_texts[1:200], unname(economy_dates[1:200]), "quarter", "6 month",
init = 20, K = 5, type = "lda")
set.seed(seed)
model2 = RollingLDA(economy_texts[1:200], economy_dates[1:200], "quarter", "6 month",
init = 20, K = 5, type = "lda")
expect_identical(model1, model2)
# keep in mind: dates must not be sorted, but are considered to be in the same
# order as texts
set.seed(seed)
model2 = RollingLDA(economy_texts[1:200], sort(unname(economy_dates[1:200])), "quarter", "6 month",
init = 20, K = 5, type = "lda")
expect_error(expect_identical(model1, model2))
})
test_that("init: date, ineger, character(date)", {
chunk = unname(sort(economy_dates)[sample(180:220, 1)])
init = unname(sort(economy_dates)[sample(80:120, 1)])
mid = unname(sort(economy_dates)[sample(145:155, 1)])
seed = Sys.time()
# chunks as character identical to Date
set.seed(seed)
model1 = RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
chunks = init, memory = 20, K = 5, type = "lda")
set.seed(seed)
model2 = RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
chunks = as.character(init), memory = 20, K = 5, type = "lda")
expect_identical(model1, model2)
set.seed(seed)
model1 = RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
chunks = c(init, mid), memory = 20, K = 5, type = "lda", vocab.abs = 0)
set.seed(seed)
model2 = RollingLDA(economy_texts[economy_dates < mid], economy_dates[economy_dates < mid],
chunks = init, memory = 20, K = 5, type = "lda", vocab.abs = 0)
model2 = RollingLDA(model2, economy_texts[economy_dates >= mid & economy_dates <= chunk],
economy_dates[economy_dates >= mid & economy_dates <= chunk], memory = 20)
expect_identical(getChunks(model1), getChunks(model2))
expect_setequal(getVocab(model1), getVocab(model2))
expect_mapequal(getDates(model1), getDates(model2))
# needed if chunks character
expect_error(RollingLDA(economy_texts[economy_dates < mid], economy_dates[economy_dates < mid],
chunks = "month", memory = 20, K = 5, type = "lda", vocab.abs = 0),
"\"init\"")
# not used if chunks dates
set.seed(seed)
model1 = RollingLDA(economy_texts[economy_dates < mid], economy_dates[economy_dates < mid],
chunks = init, memory = 20, K = 5, type = "lda", init = -12)
set.seed(seed)
model2 = RollingLDA(economy_texts[economy_dates < mid], economy_dates[economy_dates < mid],
chunks = init, memory = 20, K = 5, type = "lda", init = 0)
set.seed(seed)
model3 = RollingLDA(economy_texts[economy_dates < mid], economy_dates[economy_dates < mid],
chunks = init, memory = 20, K = 5, type = "lda", init = "999")
expect_identical(model1, model2)
expect_identical(model1, model3)
# error: init smaller than minimum of dates
expect_error(RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
chunks = min(economy_dates), memory = min(economy_dates) - 1, K = 5, type = "lda"),
"lowest date")
})
test_that("memory: character, date, integer, character(date)", {
chunk = unname(sort(economy_dates)[sample(180:220, 1)])
init = unname(sort(economy_dates)[sample(80:120, 1)])
mid = unname(sort(economy_dates)[sample(145:155, 1)])
seed = Sys.time()
# month = 1 month
set.seed(seed)
model1 = RollingLDA(economy_texts[economy_dates < mid], economy_dates[economy_dates < mid],
chunks = init, memory = "month", K = 5, type = "lda")
set.seed(seed)
model2 = RollingLDA(economy_texts[economy_dates < mid], economy_dates[economy_dates < mid],
chunks = init, memory = "1 month", K = 5, type = "lda")
set.seed(seed)
# tolower works
model3 = RollingLDA(economy_texts[economy_dates < mid], economy_dates[economy_dates < mid],
chunks = init, memory = "1 mOnTh", K = 5, type = "lda")
set.seed(seed)
# date(s) = character dates
memory1 = seq.Date(from = init, by = "-1 month", length.out = 2)[2]
model4 = RollingLDA(economy_texts[economy_dates < mid], economy_dates[economy_dates < mid],
chunks = init, memory = memory1, K = 5, type = "lda")
set.seed(seed)
# character-date
model5 = RollingLDA(economy_texts[economy_dates < mid], economy_dates[economy_dates < mid],
chunks = init, memory = as.character(memory1), K = 5, type = "lda")
expect_identical(model1, model2)
expect_identical(model1, model3)
expect_identical(model1, model4)
expect_identical(model1, model5)
seed = Sys.time()
set.seed(seed)
model1 = updateRollingLDA(model1,
economy_texts[economy_dates >= mid & economy_dates <= chunk],
economy_dates[economy_dates >= mid & economy_dates <= chunk],
memory = "month")
set.seed(seed)
model2 = updateRollingLDA(model2,
economy_texts[economy_dates >= mid & economy_dates <= chunk],
economy_dates[economy_dates >= mid & economy_dates <= chunk],
memory = "1 month")
expect_identical(model1, model2)
# memory unsorted
expect_error(RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
chunks = c(init, mid), memory = c(memory1, memory1-1), K = 5, type = "lda"),
"\"memory\" must be sorted")
# memory > chunks
expect_error(RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
chunks = c(init, mid), memory = c(memory1, mid+1), K = 5, type = "lda"),
"\"memory\" must not be greater than the pendant in \"chunks\"")
# memory rep is ok
model1 = RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
chunks = c(init, mid), memory = c(memory1, memory1), K = 5, type = "lda")
# memory empty -> warning
expect_warning(RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
chunks = c(init, mid), memory = c(memory1, mid), K = 5, type = "lda"),
"there are no texts as memory for this chunk - skip chunk")
# memory.fallback -> warning
expect_warning(RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
chunks = c(init, mid), memory = c(memory1, mid), K = 5, type = "lda", memory.fallback = 5),
"there are no texts as memory for this chunk - using \"memory.fallback\"")
# 1month -> error
expect_error(RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
chunks = init, memory = "1month", K = 5, type = "lda"))
# memory vector
expect_error(RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
chunks = c(init, mid), memory = c(memory1, "12"), K = 5, type = "lda"))
## updateRollingLDA ##
# not RollingLDA object
expect_error(updateRollingLDA(economy_texts[economy_dates >= mid & economy_dates <= chunk],
economy_dates[economy_dates >= mid & economy_dates <= chunk],
chunks = range(economy_dates[economy_dates >= mid & economy_dates <= chunk]),
memory = 4), "not a RollingLDA object")
# memory missing
expect_error(updateRollingLDA(model3,
economy_texts[economy_dates >= mid & economy_dates <= chunk],
economy_dates[economy_dates >= mid & economy_dates <= chunk],
chunks = range(economy_dates[economy_dates >= mid & economy_dates <= chunk])))
# should be fine:
seed = Sys.time()
set.seed(seed)
model4 = updateRollingLDA(model3,
economy_texts[economy_dates >= mid & economy_dates <= chunk],
economy_dates[economy_dates >= mid & economy_dates <= chunk],
chunks = range(economy_dates[economy_dates >= mid & economy_dates <= chunk]),
memory = 4)
set.seed(seed)
model5 = updateRollingLDA(model3,
economy_texts[economy_dates >= mid & economy_dates <= chunk],
unname(economy_dates[economy_dates >= mid & economy_dates <= chunk]),
chunks = range(economy_dates[economy_dates >= mid & economy_dates <= chunk]),
memory = 4)
expect_identical(model4, model5)
# should be fine:
model4 = updateRollingLDA(model3,
economy_texts[economy_dates >= mid & economy_dates <= chunk],
economy_dates[economy_dates >= mid & economy_dates <= chunk],
chunks = as.character(range(economy_dates[economy_dates >= mid & economy_dates <= chunk])),
memory = range(economy_dates[economy_dates >= mid & economy_dates <= chunk])-10)
# memory unsorted
expect_error(updateRollingLDA(model3,
economy_texts[economy_dates >= mid & economy_dates <= chunk],
economy_dates[economy_dates >= mid & economy_dates <= chunk],
chunks = range(economy_dates[economy_dates >= mid & economy_dates <= chunk]),
memory = c(min(economy_dates[economy_dates >= mid & economy_dates <= chunk]),
min(economy_dates[economy_dates >= mid & economy_dates <= chunk])-1)),
"\"memory\" must be sorted")
# memory > chunks
expect_error(updateRollingLDA(model3,
economy_texts[economy_dates >= mid & economy_dates <= chunk],
economy_dates[economy_dates >= mid & economy_dates <= chunk],
chunks = range(economy_dates[economy_dates >= mid & economy_dates <= chunk]),
memory = range(economy_dates[economy_dates >= mid & economy_dates <= chunk])+1),
"\"memory\" must not be greater than the pendant in \"chunks\"")
# chunks unsorted
expect_error(updateRollingLDA(model3,
economy_texts[economy_dates >= mid & economy_dates <= chunk],
economy_dates[economy_dates >= mid & economy_dates <= chunk],
chunks = rev(range(economy_dates[economy_dates >= mid & economy_dates <= chunk])),
memory = 4), "\"chunks\" must be sorted")
})
test_that("chunks: character, date, character(date)", {
# TODO
chunk = unname(sort(economy_dates)[sample(180:220, 1)])
init = unname(sort(economy_dates)[sample(80:120, 1)])
memory1 = seq.Date(from = init, by = "-1 month", length.out = 2)[2]
seed = Sys.time()
# chunks as character identical to Date
set.seed(seed)
model1 = RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
chunks = init, memory = 20, K = 5, type = "lda")
set.seed(seed)
model2 = RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
chunks = as.character(init), memory = 20, K = 5, type = "lda")
expect_identical(model1, model2)
# date(s)
# character chunks = date(s)
chunks = seq.Date(init+1, max(economy_dates[economy_dates <= chunk]), "quarter")
set.seed(seed)
model1 = RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
chunks = chunks, memory = 10, init = init, K = 5, type = "lda")
set.seed(seed)
model2 = RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
chunks = "quarter", memory = 10, init = init, K = 5, type = "lda")
expect_identical(model1, model2)
# chunks unsorted
expect_error(RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
chunks = c(init, init-1), memory = c(memory1, memory1), K = 5, type = "lda"),
"\"chunks\" must be sorted")
# chunks rep -> empty chunk -> warning
expect_warning(RollingLDA(economy_texts[economy_dates <= chunk], economy_dates[economy_dates <= chunk],
chunks = c(init, init), memory = c(memory1, memory1), K = 5, type = "lda"),
"there are no texts in this chunk - skip chunk")
})
test_that("staged setting: updateRollingLDA", {
# init_proto_updated1 and 2 are identical, but 3 is different (different seed)!
expect_identical(init_proto_updated1, init_proto_updated2)
expect_error(expect_identical(init_proto_updated1, init_proto_updated3))
# roll_proto2 and init_proto_updated1 should only differ in lda and
# first element of n.discarded in chunks table
expect_identical(
roll_proto2[!names(roll_proto2) %in% c("lda", "chunks")],
init_proto_updated1[!names(init_proto_updated1) %in% c("lda", "chunks")])
# initialized RollingLDA with unknown discarded texts in chunk 0
expect_equal(
getChunks(init_proto_updated1)$n.discarded[1],
NA_integer_)
# rest of chunks should be equal
expect_identical(
getChunks(roll_proto2)[-1,],
getChunks(init_proto_updated1)[-1,])
})
test_that("Parameter RollingLDA match", {
expect_identical(getParam(roll_proto),
getParam(roll_lda))
expect_identical(getParam(roll_proto2),
getParam(roll_lda))
expect_identical(getParam(roll_lda2),
getParam(roll_lda))
expect_identical(getParam(init_proto_updated1),
getParam(roll_lda))
expect_identical(rollinglda:::.defaultParam(),
getParam(roll_lda))
})
test_that("Parameter LDA match", {
expect_identical(getParam(getLDA(roll_proto)),
getParam(getLDA(roll_lda)))
expect_equal(getK(getLDA(roll_proto)), 5)
expect_equal(getAlpha(getLDA(roll_proto)), 1/5)
expect_equal(getEta(getLDA(roll_proto)), 1/5)
expect_equal(getNum.iterations(getLDA(roll_proto)), 200)
expect_equal(getK(getLDA(roll_proto2)), 10)
expect_equal(getAlpha(getLDA(roll_proto2)), 1)
expect_equal(getEta(getLDA(roll_proto2)), 1)
expect_equal(getNum.iterations(getLDA(roll_proto2)), 20)
expect_identical(getParam(getLDA(roll_proto2)),
getParam(getLDA(roll_lda2)))
expect_identical(getParam(getLDA(roll_proto2)),
getParam(getLDA(roll_lda2)))
})
test_that("print.RollingLDA", {
# prints not equal
expect_error(expect_output(print(roll_lda), capture.output(print(roll_lda2))))
# RollingLDA Object
expect_output(print(roll_lda), "RollingLDA Object")
expect_output(print(roll_lda2), "RollingLDA Object")
# LDA print is part of RollingLDA print
for(i in capture.output(print(getLDA(roll_lda)))[1:5]){
expect_output(print(roll_lda), i, fixed = TRUE)
}
for(i in capture.output(print(getLDA(roll_lda2)))[1:5]){
expect_output(print(roll_lda2), i, fixed = TRUE)
}
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.