txt <- c("PAGE 1. This is a single sentence. Short sentence. Three word sentence.",
"PAGE 2. Very short! Shorter.",
"Very long sentence, with multiple parts, separated by commas. PAGE 3.")
test_that("corpus_trim works", {
corp <- corpus(txt, docvars = data.frame(serial = 1:3))
expect_equal(
as.character(corpus_trim(corp, "sentences", min_ntok = 3)),
c(text1 = "This is a single sentence. Three word sentence.",
text3 = "Very long sentence, with multiple parts, separated by commas.")
)
expect_equal(
as.character(corpus_trim(corp, "sentences", exclude_pattern = "^PAGE \\d+")),
c(text1 = "This is a single sentence. Short sentence. Three word sentence.",
text2 = "Very short! Shorter.",
text3 = "Very long sentence, with multiple parts, separated by commas.")
)
expect_equal(
as.character(corpus_trim(corp, "documents", min_ntoken = 8)),
c(text1 = txt[1], text3 = txt[3])
)
})
test_that("corpus_trim returns empty corpus when appropriate", {
corp <- corpus(txt, docvars = data.frame(serial = 1:3))
tmp <- corpus_trim(corp, "documents", min_ntoken = 50)
expect_equal(ndoc(tmp), 0)
expect_is(tmp, "corpus")
expect_true(is.corpus(tmp))
})
test_that("char_trim works", {
expect_equal(
char_trim(txt, "sentences", min_ntok = 3),
c(text1 = "This is a single sentence. Three word sentence.",
text3 = "Very long sentence, with multiple parts, separated by commas.")
)
expect_equal(
char_trim(txt, "sentences", exclude_pattern = "^PAGE \\d+"),
c(text1 = "This is a single sentence. Short sentence. Three word sentence.",
text2 = "Very short! Shorter.",
text3 = "Very long sentence, with multiple parts, separated by commas.")
)
expect_equal(
char_trim(txt, "documents", min_ntoken = 8),
c(text1 = txt[1], text3 = txt[3])
)
})
test_that("char_trim returns empty character when appropriate", {
tmp <- char_trim(txt, "documents", min_ntoken = 50)
expect_equal(length(tmp), 0)
expect_is(tmp, "character")
expect_true(is.character(tmp))
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.