Nothing
# Segmenting on sentence-terminating punctuation (\p{Sterm}) is expected to
# give one segment per sentence, each carrying its parent document's docvars.
test_that("tokens_segment works for sentences", {
  title_vars <- data.frame(title = factor(c("doc1", "doc2")))
  raw_text <- c(
    d1 = "Sentence one. Second sentence is this one!\n
Here is the third sentence.",
    d2 = "Only sentence of doc2? No there is another."
  )
  segmented <- tokens_segment(
    tokens(corpus(raw_text, docvars = title_vars)),
    "\\p{Sterm}",
    valuetype = "regex",
    pattern_position = "after"
  )

  # Three sentences in d1 plus two in d2.
  expect_equal(ndoc(segmented), 5)

  # The fourth segment is the first sentence of d2, terminator included.
  expect_equal(
    as.list(segmented)[4],
    list(d2.1 = c("Only", "sentence", "of", "doc2", "?"))
  )

  # Every segment keeps the title of the document it was cut from.
  expected_titles <- as.factor(c("doc1", "doc1", "doc1", "doc2", "doc2"))
  expect_equal(docvars(segmented, "title"), expected_titles)
})
# Segmenting on an explicit delimiter class ([.!?]) placed after each segment:
# the delimiter token stays at the end of the segment it closes.
test_that("tokens_segment works for delimiter", {
  source_docs <- c(
    d1 = "Sentence one. Second sentence is this one!\n
Here is the third sentence.",
    d2 = "Only sentence of doc2? No there is another."
  )
  source_corpus <- corpus(
    source_docs,
    docvars = data.frame(title = factor(c("doc1", "doc2")))
  )
  pieces <- tokens_segment(tokens(source_corpus), "[.!?]",
                           valuetype = "regex", pattern_position = "after")
  pieces_list <- as.list(pieces)

  # Three segments from d1 and two from d2.
  expect_equal(ndoc(pieces), 5)

  # First segment of each document, delimiter retained.
  expect_equal(pieces_list[1], list(d1.1 = c("Sentence", "one", ".")))
  expect_equal(
    pieces_list[4],
    list(d2.1 = c("Only", "sentence", "of", "doc2", "?"))
  )

  # Document-level variables are repeated for each segment.
  expect_equal(
    docvars(pieces, "title"),
    as.factor(c("doc1", "doc1", "doc1", "doc2", "doc2"))
  )
})
# Same delimiter segmentation as above, but with extract_pattern = TRUE the
# matched delimiter is removed from the token stream of each segment.
test_that("tokens_segment works for delimiter with extract_pattern = TRUE", {
  docs <- c(d1 = "Sentence one. Second sentence is this one!\n
Here is the third sentence.",
            d2 = "Only sentence of doc2? No there is another.")
  doc_titles <- data.frame(title = factor(c("doc1", "doc2")))
  doc_tokens <- tokens(corpus(docs, docvars = doc_titles))

  stripped <- tokens_segment(
    doc_tokens,
    "[.!?]",
    valuetype = "regex",
    extract_pattern = TRUE,
    pattern_position = "after"
  )

  # The number of segments does not depend on whether the pattern is kept.
  expect_equal(ndoc(stripped), 5)

  # Segments no longer end with the punctuation token.
  expect_equal(
    as.list(stripped)[1],
    list(d1.1 = c("Sentence", "one"))
  )
  expect_equal(
    as.list(stripped)[4],
    list(d2.1 = c("Only", "sentence", "of", "doc2"))
  )

  # The title docvar is still propagated to every segment.
  expect_equal(
    docvars(stripped, "title"),
    as.factor(c("doc1", "doc1", "doc1", "doc2", "doc2"))
  )
})
# Tokens that follow the last delimiter match must be returned as a segment of
# their own instead of being dropped. This is checked for both settings of
# extract_pattern; the trailing segment contains no delimiter, so the expected
# value is the same in either mode.
test_that("tokens_segment includes left-over text", {
  txt <- c("This is the main. this is left-over")
  toks <- tokens(txt)
  leftover <- list(text1.2 = c("this", "is", "left-over"))

  # Delimiter kept in the token stream.
  toks_seg1 <- tokens_segment(toks, "[.!?]", valuetype = "regex",
                              extract_pattern = FALSE,
                              pattern_position = "after")
  expect_equal(as.list(toks_seg1)[2], leftover)

  # Delimiter extracted from the token stream. Using TRUE here makes the
  # second case distinct from the first, so both code paths are exercised.
  toks_seg2 <- tokens_segment(toks, "[.!?]", valuetype = "regex",
                              extract_pattern = TRUE,
                              pattern_position = "after")
  expect_equal(as.list(toks_seg2)[2], leftover)
})
# Checks segmentation on stand-alone punctuation tokens for three inputs
# (punctuation inside a text, punctuation at the very end, no punctuation),
# once with the matched token extracted and once with it kept.
test_that("tokens_segment works when removing punctuation match, remove_delimiter tests", {
  # Segment after every token that is exactly one punctuation character and
  # return the result as a plain named list.
  segment_on_punct <- function(x, extract) {
    as.list(tokens_segment(x, "^\\p{P}$", valuetype = "regex",
                           extract_pattern = extract,
                           pattern_position = "after"))
  }

  with_inner_punct <- tokens(c("This: is a test", "Another test"))
  with_final_punct <- tokens(c("This is a test", "Another test."))
  without_punct <- tokens(c("This is a test", "Another test"))

  # extract_pattern = TRUE
  expect_equal(
    segment_on_punct(with_inner_punct, TRUE),
    list(text1.1 = "This",
         text1.2 = c("is", "a", "test"),
         text2.1 = c("Another", "test"))
  )
  expect_equal(
    segment_on_punct(with_final_punct, TRUE),
    list(text1 = c("This", "is", "a", "test"),
         text2 = c("Another", "test"))
  )
  expect_equal(
    segment_on_punct(without_punct, TRUE),
    list(text1 = c("This", "is", "a", "test"),
         text2 = c("Another", "test"))
  )

  # extract_pattern = FALSE
  expect_equal(
    segment_on_punct(with_inner_punct, FALSE),
    list(text1.1 = c("This", ":"),
         text1.2 = c("is", "a", "test"),
         text2.1 = c("Another", "test"))
  )
  expect_equal(
    segment_on_punct(with_final_punct, FALSE),
    list(text1 = c("This", "is", "a", "test"),
         text2 = c("Another", "test", "."))
  )
  # A delimiter at the very end of a document must not trigger any
  # warning, message or printed output.
  expect_silent(segment_on_punct(with_final_punct, FALSE))
  expect_equal(
    segment_on_punct(without_punct, FALSE),
    list(text1 = c("This", "is", "a", "test"),
         text2 = c("Another", "test"))
  )
})
# Segments documents on leading "__TAG__" markers and checks all four
# combinations of extract_pattern and use_docvars: which docvars the result
# has, what the segments contain, and (without docvars) the doc/segment ids.
test_that("tokens_segment works with tags", {
  tagged_corpus <- corpus(
    c(d1 = "__TEST__ One two __TEST2__ Three",
      d2 = "__TEST3__ Four"),
    docvars = data.frame(test = c("A", "B"), stringsAsFactors = FALSE)
  )
  tagged_tokens <- tokens(tagged_corpus, what = "word")

  # Segment before each tag with the requested option combination.
  segment_by_tag <- function(extract, keep_docvars) {
    tokens_segment(tagged_tokens, "__[A-Z0-9]+__", valuetype = "regex",
                   pattern_position = "before",
                   extract_pattern = extract, use_docvars = keep_docvars)
  }

  # Expected values shared between the four cases.
  tag_values <- c("__TEST__", "__TEST2__", "__TEST3__")
  body_without_tags <- list(d1.1 = c("One", "two"),
                            d1.2 = "Three",
                            d2.1 = "Four")
  body_with_tags <- list(d1.1 = c("__TEST__", "One", "two"),
                         d1.2 = c("__TEST2__", "Three"),
                         d2.1 = c("__TEST3__", "Four"))
  parent_ids <- factor(c("d1", "d1", "d2"), levels = c("d1", "d2"))
  segment_numbers <- c(1L, 2L, 1L)

  # Tags extracted, original docvars kept.
  seg_extract_keep <- segment_by_tag(extract = TRUE, keep_docvars = TRUE)
  dv_extract_keep <- docvars(seg_extract_keep)
  expect_equal(dv_extract_keep$test, c("A", "A", "B"))
  expect_equal(dv_extract_keep$pattern, tag_values)
  expect_equal(as.list(seg_extract_keep), body_without_tags)

  # Tags left in place, original docvars kept: no pattern docvar is added.
  seg_inline_keep <- segment_by_tag(extract = FALSE, keep_docvars = TRUE)
  dv_inline_keep <- docvars(seg_inline_keep)
  expect_equal(dv_inline_keep$test, c("A", "A", "B"))
  expect_equal(dv_inline_keep$pattern, NULL)
  expect_equal(as.list(seg_inline_keep), body_with_tags)

  # Tags extracted, original docvars dropped.
  seg_extract_drop <- segment_by_tag(extract = TRUE, keep_docvars = FALSE)
  dv_extract_drop <- docvars(seg_extract_drop)
  expect_equal(dv_extract_drop$test, NULL)
  expect_equal(dv_extract_drop$pattern, tag_values)
  expect_equal(as.list(seg_extract_drop), body_without_tags)
  expect_equal(docid(seg_extract_drop), parent_ids)
  expect_equal(segid(seg_extract_drop), segment_numbers)

  # Tags left in place, original docvars dropped.
  seg_inline_drop <- segment_by_tag(extract = FALSE, keep_docvars = FALSE)
  dv_inline_drop <- docvars(seg_inline_drop)
  expect_equal(dv_inline_drop$test, NULL)
  expect_equal(dv_inline_drop$pattern, NULL)
  expect_equal(as.list(seg_inline_drop), body_with_tags)
  expect_equal(docid(seg_inline_drop), parent_ids)
  expect_equal(segid(seg_inline_drop), segment_numbers)
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.