Nothing
require(quanteda)
# create and save test object
# corp_sent <- corpus_reshape(data_corpus_inaugural, "sentence")
# toks_test <- tokens(corp_sent, remove_punct = TRUE)
# saveRDS(toks_test, "tests/data/tokens_test.RDS")
toks_test <- readRDS("../data/tokens_test.RDS")
feat_test <- head(char_context(toks_test, "america*", min_count = 1, p = 0.05), 100)
dfmt_test <- dfm(toks_test)
fcmt_test <- fcm(dfmt_test)
seed <- as.seedwords(data_dictionary_sentiment)
lss_test <- textmodel_lss(dfmt_test, seed, terms = feat_test, k = 300,
include_data = TRUE)
lss_test_nd <- textmodel_lss(dfmt_test, seed, terms = feat_test, k = 300,
include_data = FALSE)
lss_test_ss <- textmodel_lss(dfmt_test, seed[1], terms = feat_test, k = 300)
lss_test_fcm <- textmodel_lss(fcmt_test, seed, terms = feat_test, w = 50)
test_that("char_context is working", {
expect_identical(length(feat_test), 100L)
feat1 <- char_context(toks_test, "america.*", "regex", min_count = 1, p = 0.05)
expect_identical(head(feat1, 100), feat_test)
feat2 <- char_context(toks_test, "America*", case_insensitive = FALSE, min_count = 1, p = 0.05)
expect_identical(head(feat2, 100), feat_test)
feat3 <- char_context(toks_test, "america*", min_count = 1000, remove_pattern = TRUE)
expect_identical(feat3, character())
feat4 <- char_context(toks_test, "america*", min_count = 1000, remove_pattern = FALSE)
expect_identical(feat4, character())
expect_silent(char_context(toks_test, "xxxxx", min_count = 1, p = 0.05))
})
test_that("textmodel_lss has all the attributes", {
expect_equal(
names(lss_test),
c("data", "beta", "k", "slice", "frequency", "terms", "seeds", "seeds_weighted",
"embedding", "similarity", "importance",
"concatenator", "dummy", "call")
)
expect_true(is.numeric(lss_test$beta))
expect_true(is.numeric(lss_test$frequency))
expect_identical(names(lss_test$beta), names(lss_test$frequency))
expect_true(is.dfm(lss_test$data))
expect_identical(lss_test$terms, feat_test)
expect_identical(names(lss_test$seeds), names(seedwords("pos-neg")))
expect_equal(
names(lss_test_fcm),
c("data", "beta", "k", "slice", "frequency", "terms", "seeds", "seeds_weighted",
"embedding", "similarity", "importance",
"concatenator", "dummy", "call")
)
expect_true(is.numeric(lss_test_fcm$beta))
expect_true(is.numeric(lss_test_fcm$frequency))
expect_identical(names(lss_test_fcm$beta), names(lss_test_fcm$frequency))
expect_true(is.null(lss_test_fcm$data))
expect_identical(lss_test_fcm$terms, feat_test)
expect_identical(names(lss_test_fcm$seeds), names(seedwords("pos-neg")))
expect_equal(
names(lss_test_nd),
c("data", "beta", "k", "slice", "frequency", "terms", "seeds", "seeds_weighted",
"embedding", "similarity", "importance",
"concatenator", "dummy", "call")
)
})
test_that("summary.textmodel_lss is working", {
expect_silent(summary(lss_test))
expect_silent(summary(lss_test_nd))
})
test_that("predict.textmodel_lss is working", {
expect_warning(predict(lss_test, xxx = TRUE),
"xxx argument is not used")
expect_warning(predict(lss_test, se.fit = TRUE),
"'se.fit' is deprecated; use 'se_fit'")
expect_error(predict(lss_test, newdata = matrix(1:10)),
"newdata must be a dfm")
expect_error(predict(lss_test_nd),
"The model includes no data, use newdata to supply a dfm.")
pred1 <- predict(lss_test)
expect_equal(length(pred1), ndoc(dfmt_test))
expect_identical(names(pred1), docnames(dfmt_test))
expect_true(is.numeric(pred1))
expect_equal(mean(pred1, na.rm = TRUE), 0)
expect_equal(sd(pred1, na.rm = TRUE), 1)
pred2 <- predict(lss_test, se_fit = TRUE)
expect_equal(length(pred2$fit), ndoc(dfmt_test))
expect_identical(names(pred2$fit), docnames(dfmt_test))
expect_equal(length(pred2$se.fit), ndoc(dfmt_test))
expect_equal(length(pred2$n), ndoc(dfmt_test))
expect_null(names(pred2$se.fit))
expect_null(names(pred2$n))
pred3 <- predict(lss_test, density = TRUE)
expect_equal(length(pred3$density), ndoc(toks_test))
expect_null(names(pred3$density))
pred4 <- predict(lss_test, rescale = FALSE)
expect_identical(names(pred4), docnames(toks_test))
expect_equal(as.numeric(scale(pred4)), unname(pred1))
pred5 <- predict(lss_test, se_fit = TRUE, density = TRUE)
expect_equal(names(pred5), c("fit", "se.fit", "n", "density"))
pred6 <- predict(lss_test, rescale = FALSE, min_n = 2)
expect_true(all(is.na(pred4) == is.na(pred6)))
expect_true(all(abs(pred6[pred5$n == 1]) < abs(pred4[pred5$n == 1]), na.rm = TRUE))
expect_true(all(abs(pred6[pred5$n >= 2]) == abs(pred4[pred5$n >= 2]), na.rm = TRUE))
expect_error(
predict(lss_test, rescale = FALSE, min_n = -1),
"The value of min_n must be between 0 and Inf"
)
expect_error(
predict(lss_test, rescale = FALSE, min_n = c(0, 1)),
"The length of min_n must be 1"
)
})
test_that("density is correct", {
dfmt <- dfm_group(dfm(toks_test))
pred <- predict(lss_test, newdata = dfmt, density = TRUE)
expect_equal(
pred$density,
unname(rowSums(dfm_select(dfm_weight(dfmt, "prop"), feat_test)))
)
})
test_that("predict.textmodel_lss works with newdata", {
dfmt <- dfm_group(dfm(toks_test))
pred <- predict(lss_test, newdata = dfmt)
expect_equal(length(pred), ndoc(dfmt))
})
test_that("data object is valid", {
sum <- summary(data_textmodel_lss_russianprotests)
expect_equal(class(sum), c("summary.textmodel", "list"))
})
test_that("calculation of fit and se_fit are correct", {
lss <- as.textmodel_lss(c("a" = 0.1, "b" = 0.1, "c" = 0.3))
toks <- tokens(c("a a a", "a b", "a a b c c d e"))
dfmt <- dfm(toks)
pred <- predict(lss, newdata = dfmt, se_fit = TRUE, rescale = FALSE)
expect_equal(pred$fit[1], c(text1 = 0.10))
expect_equal(pred$fit[2], c(text2 = 0.10))
expect_equal(pred$fit[3], c(text3 = 0.1 * (2 / 5) + 0.1 * (1 / 5) + 0.3 * (2 / 5)))
beta <- lss$beta
dfmt_sub <- dfm_select(dfmt, names(beta))
dfmt_prop <- dfm_weight(dfmt_sub, "prop")
expect_equal(pred$se.fit[1],
unname(sqrt(sum(as.numeric(dfmt_prop[1,]) * (pred$fit[1] - beta) ^ 2)) / sqrt(rowSums(dfmt_sub)[1])))
expect_equal(pred$se.fit[2],
unname(sqrt(sum(as.numeric(dfmt_prop[2,]) * (pred$fit[2] - beta) ^ 2)) / sqrt(rowSums(dfmt_sub)[2])))
expect_equal(pred$se.fit[3],
unname(sqrt(sum(as.numeric(dfmt_prop[3,]) * (pred$fit[3] - beta) ^ 2)) / sqrt(rowSums(dfmt_sub)[3])))
expect_equal(pred$n[1], 3)
expect_equal(pred$n[2], 2)
expect_equal(pred$n[3], 5)
})
test_that("textmodel_lss works with only with single seed", {
skip_on_cran()
expect_silent(textmodel_lss(dfm(toks_test), seedwords("pos-neg")[1], terms = feat_test, k = 10))
expect_silent(textmodel_lss(dfm(toks_test), seedwords("pos-neg")[1], terms = character(), k = 10))
expect_silent(textmodel_lss(dfm(toks_test), seedwords("pos-neg")[1], k = 10))
})
test_that("textmodel_lss.fcm works with ...", {
skip_on_cran()
expect_warning(textmodel_lss(fcmt_test, seedwords("pos-neg"),
terms = feat_test, learning_rate = 0.1), NA)
expect_warning(textmodel_lss(fcmt_test, seedwords("pos-neg"),
terms = feat_test, alpha = 1), NA)
})
test_that("terms is working", {
skip_on_cran()
# glob pattern
lss1 <- textmodel_lss(dfmt_test, seed, terms = "poli*", k = 300)
expect_true(all(stringi::stri_startswith_fixed(names(lss1$beta), "poli")))
# numeric vector
weight <- sample(1:10, length(lss1$beta), replace = TRUE) / 10
names(weight) <- names(lss1$beta)
lss2 <- textmodel_lss(dfmt_test, seed, terms = weight, k = 300)
expect_true(all(lss2$beta == lss1$beta * weight))
expect_error(textmodel_lss(dfmt_test, seed, terms = c("polity" = 0.2, "politic" = -0.1), k = 300),
"terms must be positive values without NA")
expect_error(textmodel_lss(dfmt_test, seed, terms = c("polity" = 0.2, "politic" = NA), k = 300),
"terms must be positive values without NA")
expect_error(textmodel_lss(dfmt_test, seed, terms = c(01, 0.2), k = 300),
"terms must be named")
})
test_that("terms work with numeric vector", {
lss <- textmodel_lss(dfmt_test, seed, terms = "poli*", k = 300)
expect_true(all(stringi::stri_startswith_fixed(names(coef(lss)), "poli")))
})
test_that("simil_method works", {
lss_cos <- textmodel_lss(dfm(toks_test), seedwords("pos-neg")[1], terms = feat_test,
k = 10)
lss_cor <- textmodel_lss(dfm(toks_test), seedwords("pos-neg")[1], terms = feat_test,
simil_method = "correlation", k = 10)
expect_false(identical(lss_cos, lss_cor))
expect_error(textmodel_lss(dfm(toks_test), seedwords("pos-neg")[1], terms = feat_test,
simil_method = "something", k = 10), "'arg' should be one of")
})
test_that("include_data is working", {
skip_on_cran() # takes to much time
dfmt <- dfm(toks_test)
lss <- textmodel_lss(dfmt, seedwords("pos-neg"), k = 10, include_data = TRUE)
lss_nd <- textmodel_lss(dfmt, seedwords("pos-neg"), k = 10, include_data = FALSE)
expect_error(predict(lss_nd), "The model includes no data")
expect_identical(predict(lss), predict(lss_nd, newdata = dfmt))
lss_gd <- textmodel_lss(dfmt, seedwords("pos-neg"), k = 10,
include_data = TRUE, group_data = TRUE)
expect_equal(names(predict(lss_gd)), docnames(dfm_group(dfmt)))
expect_warning(
textmodel_lss(dfmt, seedwords("pos-neg"), k = 10,
include_data = FALSE, group_data = TRUE),
"group_data is ignored when include_data = FALSE"
)
})
test_that("predict.textmodel_lss computes scores correctly", {
dfmt <- dfm_group(dfm(toks_test))
dfmt[c(3, 10),] <- 0
dfmt <- as.dfm(dfmt)
pred <- predict(lss_test, newdata = dfmt)
expect_equal(length(pred), ndoc(dfmt))
expect_equal(is.na(pred[c("1789-Washington", "1797-Adams", "1825-Adams")]),
c("1789-Washington" = FALSE, "1797-Adams" = TRUE, "1825-Adams" = TRUE))
pred2 <- predict(lss_test, newdata = dfmt, se_fit = TRUE)
expect_equal(is.na(pred2$fit[c("1789-Washington", "1797-Adams", "1825-Adams")]),
c("1789-Washington" = FALSE, "1797-Adams" = TRUE, "1825-Adams" = TRUE))
expect_equal(is.na(pred2$se.fit[c(1, 3, 10)]), c(FALSE, TRUE, TRUE))
expect_equal(pred2$n[c(1, 3, 10)] == 0, c(FALSE, TRUE, TRUE))
pred3 <- predict(lss_test, newdata = dfmt, se_fit = TRUE, min_n = 2)
expect_equal(is.na(pred3$fit[c("1789-Washington", "1797-Adams", "1825-Adams")]),
c("1789-Washington" = FALSE, "1797-Adams" = TRUE, "1825-Adams" = TRUE))
expect_equal(is.na(pred3$se.fit[c(1, 3, 10)]), c(FALSE, TRUE, TRUE))
expect_equal(pred3$n[c(1, 3, 10)] == 0, c(FALSE, TRUE, TRUE))
load("../data/prediction_v0.99.RDA")
expect_equal(pred, pred_v099, tolerance = 0.0001)
expect_equal(pred2$fit, pred2_v099$fit, tolerance = 0.0001)
expect_equal(pred2$se.fit, pred2_v099$se.fit, tolerance = 0.0001)
expect_equal(pred2$n, pred2_v099$n)
})
test_that("textmodel_lss works with glob patterns", {
dfmt <- dfm(toks_test)
seed <- c("nice*" = 1, "positive*" = 1, "bad*" = -1, "negative*" = -1)
lss <- textmodel_lss(dfmt, seed, k = 10)
expect_equal(lss$seeds, seed)
expect_equal(names(lss$seeds_weighted),
c("positive", "positively", "badge", "bad", "badly", "negative"))
})
test_that("textmodel_lss works with non-existent seeds", {
seed1 <- c("good" = 1, "bad" = -1, "xyz" = -1)
expect_silent(textmodel_lss(dfmt_test, seed1, k = 10))
seed2 <- c("xyz", "xxx")
expect_error(textmodel_lss(dfmt_test, seed2, k = 10),
"No seed word is found in the dfm")
})
test_that("RSpectra and irlba work", {
expect_silent(textmodel_lss(dfmt_test, seedwords("pos-neg"), k = 10, engine = "RSpectra"))
expect_silent(textmodel_lss(dfmt_test, seedwords("pos-neg"), k = 10, engine = "irlba"))
})
test_that("text2vec works", {
skip_on_cran() # takes to much time
fcmt <- fcm(toks_test)
lss <- textmodel_lss(fcmt, seedwords("pos-neg"), engine = "rsparse")
expect_equal(
names(predict(lss, dfmt_test)),
docnames(dfmt_test)
)
expect_error(
predict(lss),
"The model includes no data"
)
expect_true(setequal(names(coef(lss)), colnames(fcmt)))
})
test_that("weight is working", {
lss1 <- textmodel_lss(dfmt_test, seedwords("pos-neg"), k = 10, weight = "count")
lss2 <- textmodel_lss(dfmt_test, seedwords("pos-neg"), k = 10, weight = "logcount")
expect_false(identical(lss1, lss2))
expect_error(
textmodel_lss(dfmt_test, seedwords("pos-neg"), k = 10, weight = "xxx")
)
})
test_that("slice argument is working", {
expect_identical(
dim(textmodel_lss(dfmt_test, seed, terms = feat_test, k = 300, slice = 100)$embedding),
dim(textmodel_lss(dfmt_test, seed, terms = feat_test, k = 300, slice = 1:100)$embedding)
)
expect_silent(
textmodel_lss(dfmt_test, seed, terms = feat_test, k = 300, slice = 1:100)
)
expect_error(
textmodel_lss(dfmt_test, seed, terms = feat_test, k = 300, slice = 1:400),
"The length of slice must be between 1 and 300"
)
})
test_that("test smooth_lss", {
skip_on_cran() # takes to much time
set.seed(1234)
dfmt <- dfm_sample(dfmt_test, size = 1000)
dat <- docvars(dfmt)
dat$lss <- predict(lss_test, newdata = dfmt)
dat$time <- as.Date(paste0(dat$Year, "-01-01"))
expect_silent(smooth_lss(dat, lss_var = "lss", date_var = "time"))
expect_error(
smooth_lss(dat),
"fit does not exist in x"
)
expect_error(
smooth_lss(smooth_lss(dat, lss_var = "President")),
"lss_var must be a numeric column"
)
expect_error(
smooth_lss(dat, lss_var = "lss"),
"date does not exist in x"
)
expect_error(
smooth_lss(dat, lss_var = "lss", date_var = "Year"),
"date_var must be a date column"
)
dat_loess <- smooth_lss(dat, lss_var = "lss", date_var = "time",
engine = "loess")
dat_locfit <- smooth_lss(dat, lss_var = "lss", date_var = "time",
engine = "locfit")
expect_true(cor(dat_loess$fit, dat_locfit$fit) > 0.90)
})
test_that("weight_seeds() works", {
expect_equal(
LSX:::weight_seeds(c("a*" = 1, "b*" = -1), c("aa", "aaa", "bb", "bbb")),
list("a*" = c("aa" = 0.5, "aaa" = 0.5),
"b*" = c("bb" = -0.5, "bbb" = -0.5))
)
expect_equal(
LSX:::weight_seeds(c("a*" = 1), c("aa", "aaa", "bb", "bbb")),
list("a*" = c("aa" = 0.5, "aaa" = 0.5))
)
expect_equal(
LSX:::weight_seeds(c("a*" = 1, "c*" = -1), c("aa", "aaa", "bb", "bbb")),
list("a*" = c("aa" = 0.5, "aaa" = 0.5),
"c*" = numeric())
)
expect_equal(
LSX:::weight_seeds(c("a*" = 1, "b*" = 1), c("aa", "aaa", "bb", "bbb")),
list("a*" = c("aa" = 0.25, "aaa" = 0.25),
"b*" = c("bb" = 0.25, "bbb" = 0.25))
)
expect_equal(
LSX:::weight_seeds(c("aa" = 1, "aaa" = 1, "bb" = 1), c("aa", "aaa", "bb", "bbb")),
list("aa" = c("aa" = 0.333),
"aaa" = c("aaa" = 0.333),
"bb" = c("bb" = 0.333)),
tolerance = 0.01
)
expect_equal(
LSX:::weight_seeds(c("aa" = 1, "aaa" = 1, "bb" = -1), c("aa", "aaa", "bb", "bbb")),
list("aa" = c("aa" = 0.5),
"aaa" = c("aaa" = 0.5),
"bb" = c("bb" = -1)),
)
})
test_that("old argument still works", {
skip_on_cran() # takes to much time
suppressWarnings({
lss <- textmodel_lss(dfmt_test, seed, features = feat_test, k = 300)
})
expect_equal(lss_test$terms, lss$terms)
suppressWarnings({
lss_fcm <- textmodel_lss(fcmt_test, seed, features = feat_test, w = 50)
})
expect_equal(lss_test$terms, lss_fcm$terms)
})
test_that("se_fit is working", {
beta <- c(a = 0.2, b = 0.1, z = 0)
lss <- as.textmodel_lss(beta)
dfmt1 <- dfm(tokens(c("a a a b b", "")))
dfmt2 <- dfm(tokens(c("a a a b b z z z z z", "")))
pred1 <- predict(lss, newdata = dfmt1, rescale = FALSE, min_n = 10, se_fit = TRUE)
pred2 <- predict(lss, newdata = dfmt2, rescale = FALSE, se_fit = TRUE)
expect_identical(pred1, pred2)
})
test_that("cut is working", {
skip_on_cran() # takes to much time
p0 <- predict(lss_test, rescale = TRUE, min_n = 10)
p1 <- predict(lss_test, cut = 0.5, rescale = TRUE)
expect_true(min(p1, na.rm = TRUE) < -1)
expect_true(max(p1, na.rm = TRUE) > 1)
expect_equal(cor(p0, p1, use = "pair"), 0.59, tolerance = 0.01)
p2 <- predict(lss_test, cut = 0.5, rescale = FALSE)
expect_true(min(p2, na.rm = TRUE) >= -1)
expect_true(max(p2, na.rm = TRUE) <= 1)
expect_equal(cor(p0, p2, use = "pair"), 0.59, tolerance = 0.01)
p3 <- predict(lss_test, cut = 0.5, rescale = FALSE, min_n = 10)
expect_true(min(p3, na.rm = TRUE) >= -1)
expect_true(max(p3, na.rm = TRUE) <= 1)
expect_equal(cor(p0, p3, use = "pair"), 0.73, tolerance = 0.01)
p4 <- predict(lss_test, cut = 0.75, rescale = FALSE, min_n = 10)
expect_true(min(p4, na.rm = TRUE) >= -1)
expect_true(max(p4, na.rm = TRUE) <= 1)
expect_equal(cor(p0, p4, use = "pair"), 0.33, tolerance = 0.01)
p5 <- predict(lss_test, cut = c(0.25, 0.75), rescale = FALSE, min_n = 10)
expect_true(min(p5, na.rm = TRUE) >= -1)
expect_true(max(p5, na.rm = TRUE) <= 1)
expect_equal(cor(p0, p5, use = "pair"), 0.77, tolerance = 0.01)
p6 <- predict(lss_test, cut = c(0.75, 0.25), rescale = FALSE, min_n = 10)
expect_identical(p5, p6)
expect_error(
predict(lss_test, cut = 1.5),
"The value of cut must be between 0 and 1"
)
expect_error(
predict(lss_test, cut = -0.1),
"The value of cut must be between 0 and 1"
)
expect_error(
predict(lss_test, cut = c(0.1, 0.5, 0.9)),
"The length of cut must be between 1 and 2"
)
expect_equal(
LSX:::cut_beta(c(1.1, -1.2, 0.5, 0.3, -0.2, -0.5)),
c(1, -1, 1, 1, -1, -1)
)
expect_equal(
LSX:::cut_beta(c(1.1, -1.2, 0.5, 0.3, -0.2, -0.5), c(0.2, 0.8)),
c(1, -1, 0, 0, 0, -1)
)
beta <- rnorm(nfeat(dfmt_test), sd = 0.1)
names(beta) <- featnames(dfmt_test)
beta2 <- LSX:::cut_beta(beta, c(0.2, 0.8))
lss1 <- as.textmodel_lss(beta)
lss2 <- as.textmodel_lss(beta2)
expect_equal(names(lss1$beta), names(lss2$beta))
pred0 <- predict(lss1, dfmt_test, se_fit = TRUE)
pred1 <- predict(lss1, dfmt_test, cut = c(0.2, 0.8), se_fit = TRUE)
pred2 <- predict(lss2, dfmt_test, se_fit = TRUE)
expect_equal(pred0$n, pred1$n)
expect_equal(pred0$n, pred2$n)
expect_equal(pred1$fit, pred2$fit)
})
test_that("rescaling still works", {
expect_warning({
p1 <- predict(lss_test, rescaling = TRUE)
})
expect_silent({
p2 <- predict(lss_test, rescale = TRUE)
})
expect_identical(p1, p2)
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.