tests/testthat/test-as.textmodel.R

require(quanteda)

mat_test <- readRDS("../data/matrix_embedding.RDS")
toks_test <- readRDS("../data/tokens_test.RDS")
feat_test <- head(char_context(toks_test, "america*", min_count = 1, p = 0.05), 100)
dfmt_test <- dfm(toks_test)
seed <- as.seedwords(data_dictionary_sentiment)
lss_test <- textmodel_lss(dfmt_test, seed, terms = feat_test, k = 50,
                          include_data = FALSE)

test_that("as.textmodel_lss works with matrix", {

    term <- c("decision", "instance", "universal", "foundations", "the")

    # with terms
    lss1 <- as.textmodel_lss(mat_test, seed, term)
    expect_equal(names(lss1), names(LSX:::build_lss()))
    expect_identical(lss1$embedding, mat_test)
    expect_false(any(duplicated(names(coef(lss1)))))
    pred1 <- predict(lss1, dfmt_test)
    expect_equal(names(pred1), rownames(dfmt_test))
    expect_equal(rowSums(dfmt_test[,names(lss1$beta)]) == 0,
                 is.na(pred1))

    # without terms
    lss2 <- as.textmodel_lss(mat_test, seed)
    expect_equal(names(lss2), names(LSX:::build_lss()))
    expect_identical(lss2$embedding, mat_test)
    expect_false(any(duplicated(names(coef(lss2)))))
    pred2 <- predict(lss2, dfmt_test)
    expect_equal(names(pred2), rownames(dfmt_test))
    expect_equal(rowSums(dfmt_test[,names(lss2$beta)]) == 0,
                 is.na(pred2))

    # with special features
    mat_special <- mat_test
    colnames(mat_special)[1:2] <- c("", "*")
    lss3 <- as.textmodel_lss(mat_special, seed)
    expect_equal(sum("" == names(coef(lss3))), 0)
    expect_equal(sum("*" == names(coef(lss3))), 1)

    # with slice
    lss4 <- as.textmodel_lss(mat_test, seed, slice = 50)
    expect_error(
        as.textmodel_lss(mat_test, seed, slice = 150),
        "The value of slice must be between 1 and 100"
    )
    expect_error(
        as.textmodel_lss(mat_test, seed, slice = 1:150),
        "The length of slice must be between 1 and 100"
    )
    expect_identical(coef(lss4),
                     coef(as.textmodel_lss(mat_test, seed, slice = 1:50)))
    expect_identical(lss4$embedding, mat_test)
})

test_that("as.textmodel_lss errors with invalid columns", {
    seed <- as.seedwords(data_dictionary_sentiment)
    mat_nocol <- mat_nacol <- mat_na <- mat_test
    colnames(mat_nocol) <- NULL
    expect_error(as.textmodel_lss(mat_nocol, seed),
                 "x must have column names for features")
    colnames(mat_nacol)[1] <- NA
    expect_error(as.textmodel_lss(mat_nacol, seed),
                 "x must not have NA in the column names")
    mat_na[1,1] <- NA
    expect_error(as.textmodel_lss(mat_na, seed),
                 "x must not have NA")
})

test_that("as.textmodel_lss works with textmodel_lss", {

    # with fitted model
    lss <- as.textmodel_lss(lss_test, seed, terms = feat_test, slice = 10)
    expect_equal(lss$embedding, lss_test$embedding)
    expect_identical(lss$data, lss_test$data)
    expect_identical(lss$frequency, lss_test$frequency)
    expect_identical(names(lss$frequency), names(lss$frequency))

    expect_error(
        as.textmodel_lss(lss_test, seed, slice = 100),
        "The value of slice must be between 1 and 50"
    )
    expect_error(
        as.textmodel_lss(lss_test, seed, slice = 1:100),
        "The length of slice must be between 1 and 50"
    )

    # with dummy LSS
    weight <- c("decision" = 0.1, "instance" = -0.1,
                "foundations" = 0.3, "the" = 0)
    lss_dummy <- as.textmodel_lss(weight)
    expect_error(
        as.textmodel_lss(lss_dummy, seed),
        "x must be a valid textmodel_lss object"
    )
})

test_that("as.textmodel_lss works with vector", {
    weight <- c("decision" = 0.1, "instance" = -0.1,
                "foundations" = 0.3, "the" = 0)
    lss <- as.textmodel_lss(weight)
    expect_equal(names(lss), names(LSX:::build_lss()))
    pred <- predict(lss, dfmt_test)
    expect_equal(names(pred), rownames(dfmt_test))
    expect_equal(rowSums(dfmt_test[,names(lss$beta)]) == 0,
                 is.na(pred))
})

test_that("as.textmodel_lss errors with vector", {
    weight <- c("decision" = 0.1, "instance" = -0.1,
                "foundations" = 0.3, "the" = 0)
    weight_noname <- weight_naname <- weight_na <- weight
    names(weight_noname) <- NULL
    expect_error(as.textmodel_lss(weight_noname),
                 "x must have names for features")
    names(weight_naname)[1] <- NA
    expect_error(as.textmodel_lss(weight_naname),
                 "x must not have NA in the names")
    weight_na[1] <- NA
    expect_error(as.textmodel_lss(weight_na),
                 "x must not have NA")
})

test_that("auto_weight is working", {
    skip_on_cran()

    lss1 <- as.textmodel_lss(mat_test, seed)
    lss2 <- as.textmodel_lss(mat_test, seed, auto_weight = TRUE)
    expect_true(
        all(lss1$seeds_weighted != lss2$seeds_weighted)
    )
    expect_true(
        all(sign(lss1$seeds_weighted) == sign(lss2$seeds_weighted))
    )
    expect_true(
        all(abs(lss2$beta[names(lss2$seeds_weighted)] - lss1$seeds_weighted) < 0.05)
    )
    expect_output(
        as.textmodel_lss(mat_test, seed, auto_weight = TRUE, verbose = TRUE),
        "Optimizing seed weights..."
    )
})

test_that("terms is working", {
    skip_on_cran()

    lss <- textmodel_lss(dfmt_test, seed, k = 50)

    # glob pattern
    lss1 <- as.textmodel_lss(lss, seed, terms = "poli*")
    expect_equal(sum(stringi::stri_startswith_fixed(names(lss1$beta), "poli")), 11)
    expect_identical(names(lss1$beta), names(lss1$frequency))

    # numeric vector
    weight <- sample(1:10, length(lss1$beta), replace = TRUE) / 10
    names(weight) <- names(lss1$beta)
    lss2 <- as.textmodel_lss(lss, seed, terms = weight)
    expect_true(all(lss2$beta == lss1$beta * weight))
    expect_error(as.textmodel_lss(lss, seed, terms = c("polity" = 0.2, "politic" = -0.1)),
                 "terms must be positive values without NA")
    expect_error(as.textmodel_lss(lss, seed, terms = c("polity" = 0.2, "politic" = NA)),
                 "terms must be positive values without NA")
    expect_error(as.textmodel_lss(lss, seed, terms = c(01, 0.2)),
                 "terms must be named")

})
koheiw/LSS documentation built on March 9, 2024, 4:41 a.m.