# Nothing
test_that("tokens_chunk works with docvars, empty documents and overlap", {
  # Fixture: two non-empty documents plus an empty one (d3), each with a
  # "title" docvar. None of the expectations below contains a chunk for d3,
  # while "d3" is still expected as a level of the docid_ factor.
  txt <- c(d1 = "a b c d", d2 = "e f g", d3 = "")
  corp <- corpus(txt, docvars = data.frame(title = c("title1", "title2", "title3"),
                                           stringsAsFactors = FALSE))
  toks <- tokens(corp)

  # Case 1: size exceeds every document length, so each non-empty document
  # is expected back as a single chunk under its original name.
  toks_chunk1 <- tokens_chunk(toks, size = 5)
  expect_identical(as.list(toks_chunk1),
                   list(d1 = c("a", "b", "c", "d"),
                        d2 = c("e", "f", "g")))
  expect_identical(attr(toks_chunk1, "docvars"),
                   data.frame("docname_" = c("d1", "d2"),
                              "docid_" = factor(c("d1", "d2"),
                                                levels = c("d1", "d2", "d3")),
                              "segid_" = c(1L, 1L),
                              "title" = c("title1", "title2"),
                              stringsAsFactors = FALSE,
                              check.names = FALSE))

  # Case 2: size = 2 splits documents; chunk names get a ".<segment>" suffix
  # and the user docvar "title" is repeated for every chunk of a document.
  toks_chunk2 <- tokens_chunk(toks, size = 2)
  expect_identical(as.list(toks_chunk2),
                   list(d1.1 = c("a", "b"),
                        d1.2 = c("c", "d"),
                        d2.1 = c("e", "f"),
                        d2.2 = c("g")))
  expect_identical(attr(toks_chunk2, "docvars"),
                   data.frame("docname_" = c("d1.1", "d1.2", "d2.1", "d2.2"),
                              "docid_" = factor(c("d1", "d1", "d2", "d2"),
                                                levels = c("d1", "d2", "d3")),
                              "segid_" = c(1L, 2L, 1L, 2L),
                              "title" = c("title1", "title1", "title2", "title2"),
                              stringsAsFactors = FALSE,
                              check.names = FALSE))

  # Case 3: use_docvars = FALSE yields the same chunks, but the expected
  # docvars carry only the system columns (no "title").
  toks_chunk3 <- tokens_chunk(toks, size = 2, use_docvars = FALSE)
  expect_identical(as.list(toks_chunk3),
                   list(d1.1 = c("a", "b"),
                        d1.2 = c("c", "d"),
                        d2.1 = c("e", "f"),
                        d2.2 = c("g")))
  expect_identical(attr(toks_chunk3, "docvars"),
                   data.frame("docname_" = c("d1.1", "d1.2", "d2.1", "d2.2"),
                              "docid_" = factor(c("d1", "d1", "d2", "d2"),
                                                levels = c("d1", "d2", "d3")),
                              "segid_" = c(1L, 2L, 1L, 2L),
                              stringsAsFactors = FALSE,
                              check.names = FALSE))

  # Case 4: overlap = 1 makes consecutive chunks share one token, so each
  # document ends with a trailing one-token chunk.
  toks_chunk4 <- tokens_chunk(toks, 2, overlap = 1)
  expect_identical(as.list(toks_chunk4),
                   list(d1.1 = c("a", "b"),
                        d1.2 = c("b", "c"),
                        d1.3 = c("c", "d"),
                        d1.4 = c("d"),
                        d2.1 = c("e", "f"),
                        d2.2 = c("f", "g"),
                        d2.3 = c("g")))
  expect_identical(attr(toks_chunk4, "docvars"),
                   data.frame("docname_" = c("d1.1", "d1.2", "d1.3", "d1.4", "d2.1", "d2.2", "d2.3"),
                              "docid_" = factor(c("d1", "d1", "d1", "d1", "d2", "d2", "d2"),
                                                levels = c("d1", "d2", "d3")),
                              "segid_" = c(1L, 2L, 3L, 4L, 1L, 2L, 3L),
                              "title" = c("title1", "title1", "title1", "title1", "title2",
                                          "title2", "title2"),
                              stringsAsFactors = FALSE,
                              check.names = FALSE))
})
test_that("tokens_chunk raises error for invalid size", {
  # Any valid tokens object will do; only the size argument is under test.
  toks_in <- tokens(c(d1 = "a b c d", d2 = "e f g"))

  # Negative and zero sizes are expected to fail with the same range message.
  range_msg <- "The value of size must be between 1 and Inf"
  expect_error(tokens_chunk(toks_in, size = -1), range_msg)
  expect_error(tokens_chunk(toks_in, size = 0), range_msg)

  # A size vector of length two is expected to fail the length check.
  expect_error(
    tokens_chunk(toks_in, size = c(1, 3)),
    "The length of size must be 1"
  )
})
test_that("tokens_chunk raises error for invalid overlap", {
  # Any valid tokens object will do; only the overlap argument is under test.
  toks_in <- tokens(c(d1 = "a b c d", d2 = "e f g"))

  # An overlap equal to the chunk size is expected to be rejected.
  too_large_msg <- "The value of overlap must be smaller than size"
  expect_error(tokens_chunk(toks_in, size = 2, overlap = 2), too_large_msg)

  # A negative overlap is expected to be rejected by the range check.
  negative_msg <- "The value of overlap must be between 0 and Inf"
  expect_error(tokens_chunk(toks_in, size = 2, overlap = -1), negative_msg)
})
test_that("tokens_chunk works on documents with default names", {
  # Unnamed input: the expectations below rely on the default document
  # names "text1" and "text2".
  toks <- tokens(c("a b c d e f", "a a b d c"))

  # Chunk once and reuse the result for every expectation.
  toks_chunk <- tokens_chunk(toks, size = 3)

  # The result must still be a tokens object.
  expect_s3_class(toks_chunk, "tokens")

  # Six tokens split evenly into two chunks; five tokens leave a shorter
  # final chunk. Names carry the ".<segment>" suffix, matching docname_ below.
  expect_identical(
    as.list(toks_chunk),
    list(text1.1 = c("a", "b", "c"),
         text1.2 = c("d", "e", "f"),
         text2.1 = c("a", "a", "b"),
         text2.2 = c("d", "c"))
  )

  # No user docvars were supplied, so only the system columns are expected.
  expect_identical(
    attr(toks_chunk, "docvars"),
    data.frame("docname_" = c("text1.1", "text1.2", "text2.1", "text2.2"),
               "docid_" = factor(c("text1", "text1", "text2", "text2"),
                                 levels = c("text1", "text2")),
               "segid_" = c(1L, 2L, 1L, 2L),
               stringsAsFactors = FALSE,
               check.names = FALSE)
  )
})
test_that("tokens_chunk() works with sizes longer than tokens length", {
  # d1 has five tokens (one more than the chunk size); d2 has only three,
  # i.e. fewer than the chunk size.
  toks_in <- tokens(c(d1 = "a b c d e", d2 = "a b c"))
  chunks_actual <- as.list(tokens_chunk(toks_in, size = 4))

  # d1 is expected to yield a full chunk plus a one-token remainder; d2 is
  # expected to come back whole as a single chunk.
  chunks_expected <- list(
    d1.1 = c("a", "b", "c", "d"),
    d1.2 = "e",
    d2.1 = c("a", "b", "c")
  )

  expect_identical(chunks_actual, chunks_expected)
})
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.