# tests/testthat/test-tokens-word1.R

test_that("tokens works for strange spaces (#796)", {
    # The input mixes ordinary spaces with a tab, a newline, a non-breaking
    # space (U+00A0) and a variation selector (U+FE0F).
    txt <- "space tab\t newline\n non-breakingspace\u00A0, variationselector16 \uFE0F."

    # Case 1: keep both punctuation and separators.
    toks_all <- tokens(txt, what = "word1", remove_punct = FALSE, remove_separators = FALSE)
    expect_identical(ntoken(toks_all), c(text1 = 15L))
    # The trailing tokens are the last word, the separator, and the full stop.
    expect_identical(
        as.character(toks_all)[13:15],
        c("variationselector16", " ", ".")
    )

    # Case 2: drop punctuation but keep separators. The count is taken via
    # the character-input path of ntoken(), which receives the tokenizer
    # options directly.
    expect_identical(
        ntoken(txt, remove_punct = TRUE, remove_separators = FALSE, what = "word1"),
        c(text1 = 13L)
    )
    toks_nopunct <- tokens(txt, remove_punct = TRUE, remove_separators = FALSE,
                           what = "word1")
    expect_identical(
        as.character(toks_nopunct)[12:13],
        c("variationselector16", " ")
    )
})

test_that("output is correct for word1", {
    # Regular expressions that the verbose tokenizer output must match.
    # Each one is checked against its own fresh tokenization run.
    expected_messages <- c(
        "preserving hyphens",
        "Finished constructing tokens from 9 documents",
        "^Creating a tokens object from a character input"
    )

    for (pattern in expected_messages) {
        expect_message(
            tmp <- tokens(data_char_ukimmig2010, what = "word1", split_hyphens = FALSE, verbose = TRUE),
            pattern
        )
    }
})

test_that("symbols and punctuation are handled separately (#1445)", {
    txt <- "£ € 👏 Rock on❗ 💪️🎸"

    # Tokenize `txt` with symbols kept, returning the tokens as a character
    # vector; only the tokenizer type and the punctuation switch vary.
    tokenize_keeping_symbols <- function(what, remove_punct) {
        as.character(tokens(txt, what = what, remove_symbols = FALSE, remove_punct = remove_punct))
    }

    # With symbols kept, toggling remove_punct must not change the result
    # for any of the tokenizer types under test.
    for (tokenizer in c("word1", "fasterword", "fastestword")) {
        expect_identical(
            tokenize_keeping_symbols(tokenizer, remove_punct = TRUE),
            tokenize_keeping_symbols(tokenizer, remove_punct = FALSE)
        )
    }
})

# Try the quanteda package in your browser
#
# Any scripts or data that you put into this service are public.
#
# quanteda documentation built on May 31, 2023, 8:28 p.m.