# tests/testthat/test-nfunctions.R
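# Unit tests for the counting functions ntoken(), ntype(), and nsentence().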

test_that("test ntoken on sentences", {
    txt <- c(doc1 = "This is Mr. Smith.  He is married to Mrs. Jones.",
             doc2 = "Never, before: a colon!  Gimme a break.")
    expect_identical(
        ntoken(tokens(txt, what = "sentence")),
        c(doc1 = 2L, doc2 = 2L)
    )
})

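# For a dfm, ntype() should return the number of non-zero features per document
# and ntoken() the total feature counts per document (issue #748).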
test_that("test ntype with dfm (#748)", {
    d <- dfm(tokens(c(doc1 = "one two three",
                      doc2 = "one one one")))
    expect_identical(
        ntype(d),
        c(doc1 = 3L, doc2 = 1L)
    )
    expect_identical(
        ntoken(d),
        c(doc1 = 3L, doc2 = 3L)
    )
})

# test_that("cannot call ntoken on a weighted dfm", {
#     d <- dfm(c(doc1 = "one two three", doc2 = "one one one")) |>
#         dfm_weight(scheme = "prop")
#     expect_error(
#         ntoken(d),
#         "cannot count the tokens in a weighted dfm - use colSums\\(\\) instead"
#     )
# })

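# tokens_remove(..., padding = TRUE) replaces removed tokens with empty-string
# pads; remove_padding = TRUE should exclude those pads from the token count.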
test_that("test ntoken.tokens", {
    txt <- c(d1 = "a b c a b c", 
             d2 = "a b c d e")
    corp <- corpus(txt)
    toks <- tokens(corp)
    toks2 <- tokens_remove(toks, "a", padding = TRUE)
    
    expect_identical(ntoken(toks), c(d1 = 6L, d2 = 5L))
    expect_identical(ntoken(toks, remove_padding = TRUE), c(d1 = 6L, d2 = 5L))
    expect_identical(ntoken(toks2, remove_padding = TRUE), c(d1 = 4L, d2 = 4L))
    expect_error(
        ntoken(toks2, remove_padding = c(TRUE, FALSE)),
        "The length of remove_padding must be 1"
    )
})

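# As above, but pads should also be excluded from the type count when
# remove_padding = TRUE.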
test_that("test ntype.tokens", {
    txt <- c(d1 = "a b c a b c", 
             d2 = "a b c d e")
    corp <- corpus(txt)
    toks <- tokens(corp)
    toks2 <- tokens_remove(toks, "a", padding = TRUE)
    
    expect_identical(ntype(toks), c(d1 = 3L, d2 = 5L))
    expect_identical(ntype(toks, remove_padding = TRUE), c(d1 = 3L, d2 = 5L))
    expect_identical(ntype(toks2, remove_padding = TRUE), c(d1 = 2L, d2 = 4L))
    expect_error(
        ntype(toks2, remove_padding = c(TRUE, FALSE)),
        "The length of remove_padding must be 1"
    )
})

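# Extra arguments in ... are expected to be applied via tokens() before
# counting, so the counts reflect the re-processed tokens; an unused argument
# should only produce a warning.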
test_that("dots are applied in ntokens.tokens, ntype.tokens", {
    txt <- c(d1 = "3 wonderful tokens of the tokens function.")
    toks <- tokens(txt)

    expect_identical(ntoken(toks), c(d1 = 8L))
    expect_identical(ntoken(toks, remove_punct = TRUE), c(d1 = 7L))
    expect_identical(ntoken(toks, remove_punct = TRUE, remove_numbers = TRUE), c(d1 = 6L))
    expect_warning(ntoken(toks, notarg = TRUE), "^notarg argument is not used")

    expect_identical(ntype(toks), c(d1 = 7L))
    expect_identical(ntype(toks, remove_punct = TRUE), c(d1 = 6L))
    expect_identical(ntype(toks, remove_punct = TRUE, remove_numbers = TRUE), c(d1 = 5L))
    expect_warning(ntype(toks, notarg = TRUE), "^notarg argument is not used")

    suppressWarnings(expect_identical(ntype(txt, remove_punct = TRUE), c(d1 = 6L)))
    expect_identical(ntype(txt), c(d1 = 7L))
})

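# nsentence() should count sentences consistently for character, corpus, and
# sentence-tokenised inputs.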
test_that("test nsentence", {
    txt <- c(doc1 = "This is Mr. Smith.  He is married to Mrs. Jones.",
             doc2 = "Never, before: a colon!  Gimme a break.")
    suppressWarnings(expect_identical(nsentence(txt), c(doc1 = 2L, doc2 = 2L)))
    expect_identical(nsentence(corpus(txt)), c(doc1 = 2L, doc2 = 2L))
    expect_identical(
        nsentence(tokens(txt, what = "sentence")),
        c(doc1 = 2L, doc2 = 2L)
    )
})

test_that("nsentence warnings work", {
    txt <- c(d1 = "one two three")
    expect_warning(
        nsentence(txt),
        "nsentence() does not correctly count sentences in all lower-cased text",
        fixed = TRUE
    )
    expect_warning(
        nsentence(corpus(txt)),
        "nsentence() does not correctly count sentences in all lower-cased text",
        fixed = TRUE
    )
})