test_that("test that ngrams produces the results from Guthrie 2006", {
  # Reference sentence together with the n-grams and skip-grams this test
  # expects tokens_ngrams() to produce for it.
  toks <- tokens("insurgents killed in ongoing fighting")

  # Pull out the character vector for the first (and only) document.
  first_doc <- function(x) as.list(x)[[1]]

  bi_grams <- c(
    "insurgents_killed", "killed_in", "in_ongoing", "ongoing_fighting"
  )
  two_skip_bi_grams <- c(
    "insurgents_killed", "insurgents_in", "insurgents_ongoing",
    "killed_in", "killed_ongoing", "killed_fighting",
    "in_ongoing", "in_fighting",
    "ongoing_fighting"
  )
  tri_grams <- c(
    "insurgents_killed_in", "killed_in_ongoing", "in_ongoing_fighting"
  )
  two_skip_tri_grams <- c(
    "insurgents_killed_in", "insurgents_killed_ongoing",
    "insurgents_killed_fighting", "insurgents_in_ongoing",
    "insurgents_in_fighting", "insurgents_ongoing_fighting",
    "killed_in_ongoing", "killed_in_fighting", "killed_ongoing_fighting",
    "in_ongoing_fighting"
  )

  # Plain and skip bigrams.
  expect_setequal(first_doc(tokens_ngrams(toks, n = 2, skip = 0)), bi_grams)
  expect_setequal(
    first_doc(tokens_ngrams(toks, n = 2, skip = 0:2)),
    two_skip_bi_grams
  )

  # Plain and skip trigrams.
  expect_setequal(first_doc(tokens_ngrams(toks, n = 3, skip = 0)), tri_grams)
  expect_setequal(
    first_doc(tokens_ngrams(toks, n = 3, skip = 0:2)),
    two_skip_tri_grams
  )

  # Several n-gram sizes requested at once, checked with and without
  # warnings suppressed.
  mixed_grams <- c(bi_grams, tri_grams)
  expect_setequal(first_doc(tokens_ngrams(toks, n = 2:3)), mixed_grams)
  expect_setequal(
    first_doc(suppressWarnings(tokens_ngrams(toks, n = 2:3))),
    mixed_grams
  )
})
test_that("char_ngrams works", {
  # Already-split words: the expected result joins each adjacent pair.
  words <- c("insurgents", "killed", "in", "ongoing", "fighting")
  expected <- c(
    "insurgents_killed", "killed_in", "in_ongoing", "ongoing_fighting"
  )
  expect_equivalent(char_ngrams(words), expected)

  # An element that still contains a space is expected to raise the
  # "run tokens() first" warning.
  unsplit <- c("insurgents killed", "in", "ongoing", "fighting")
  expect_warning(
    char_ngrams(unsplit),
    "whitespace detected: you may need to run tokens\\(\\) first"
  )
})
test_that("token_skipgrams works", {
  toks <- tokens("insurgents killed in ongoing fighting")

  # Left/right members of every expected bigram for skip = 0:1, in the
  # order the expectations below compare them.
  left <- c("insurgents", "insurgents", "killed", "killed", "in", "in",
            "ongoing")
  right <- c("killed", "in", "in", "ongoing", "ongoing", "fighting",
             "fighting")
  n_expected <- length(left)

  # ASCII concatenator
  ascii_sep <- " "
  ngms1 <- tokens_skipgrams(toks, n = 2, skip = 0:1, concatenator = ascii_sep)
  first_ascii <- as.list(ngms1)[[1]]
  expect_equivalent(first_ascii, paste(left, right, sep = ascii_sep))
  expect_equal(Encoding(types(ngms1)), rep("unknown", n_expected))

  # Unicode concatenator
  unicode_sep <- "\u00a0"
  ngms2 <- tokens_skipgrams(toks, n = 2, skip = 0:1,
                            concatenator = unicode_sep)
  first_unicode <- as.list(ngms2)[[1]]
  expect_equivalent(first_unicode, paste(left, right, sep = unicode_sep))
  expect_equal(Encoding(types(ngms2)), rep("UTF-8", n_expected))
})
test_that("raises error for invalid inputs", {
  toks <- tokens("insurgents killed in ongoing fighting")

  # n = 0 is expected to be rejected.
  expect_error(tokens_ngrams(toks, n = 0),
               regexp = "The value of n must be between 1 and Inf")

  # A negative skip is expected to be rejected.
  expect_error(tokens_ngrams(toks, skip = -1),
               regexp = "The value of skip must be between 0 and Inf")

  # A zero-length concatenator is expected to be rejected.
  expect_error(tokens_ngrams(toks, concatenator = character()),
               regexp = "The length of concatenator must be 1")
})
test_that("tokens_ngrams does nothing when n = 1 and skip = 0 (#1395)", {
  toks <- tokens("insurgents killed in ongoing fighting")

  # Unigrams with no skipping should hand back the input object unchanged,
  # whatever concatenator is supplied.
  unigrams <- tokens_ngrams(toks, n = 1, skip = 0, concatenator = " ")
  expect_identical(unigrams, toks)
})
test_that("test there is no competition between threads", {
  skip_on_cran()
  skip_on_travis()
  skip_on_appveyor()

  # increase the chance to generate the same ngram by duplicating texts
  txt <- char_tolower(rep("Insurgents killed in ongoing fighting.", 10))
  toks <- tokens(txt, remove_punct = TRUE)

  # Expected bigrams followed by trigrams, repeated once per document.
  per_doc <- c(
    "insurgents_killed", "killed_in", "in_ongoing", "ongoing_fighting",
    "insurgents_killed_in", "killed_in_ongoing", "in_ongoing_fighting"
  )
  ngrs <- rep(list(per_doc), 10)
  names(ngrs) <- names(toks)

  # needs to be repeated because thread competition happen at low chance
  matches <- vapply(
    seq_len(1000),
    function(i) identical(as.list(tokens_ngrams(toks, n = 2:3)), ngrs),
    logical(1)
  )
  expect_true(all(matches))
})
test_that("tokens_ngrams(x, n = ...) works when ntokens(x) < n", {
  ## issue #392
  # A single token with n = 2: the tokens-based result for the first
  # document should match what char_ngrams() gives for the same input.
  from_tokens <- unclass(as.list(tokens_ngrams(tokens("a"), n = 2)))[[1]]
  from_char <- char_ngrams("a", n = 2)
  expect_equivalent(from_tokens, from_char)
})
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.