Nothing
# Shared fixture: two short documents whose only difference is the extra
# upper-case "BB" token in doc1. They are tokenized once here and the
# resulting tokens object is reused by the tests in this file.
txt <- c(
  doc1 = "aa bb BB cc DD ee",
  doc2 = "aa bb cc DD ee"
)
toks_test <- tokens(txt)
test_that("tokens_replace works with regular pattern and replacement", {
  # Run tokens_replace() on the shared fixture and return a plain list so the
  # result can be compared against literal expectations.
  replaced <- function(pattern, replacement, valuetype, case_insensitive) {
    as.list(tokens_replace(toks_test, pattern, replacement,
                           valuetype = valuetype,
                           case_insensitive = case_insensitive))
  }

  # Expected results shared by the fixed, glob and regex variants.
  # Case-insensitive matching also rewrites the upper-case "BB" in doc1.
  exp_insensitive <- list(
    doc1 = c("a", "b", "b", "cc", "DD", "ee"),
    doc2 = c("a", "b", "cc", "DD", "ee")
  )
  # Case-sensitive matching leaves "BB" untouched.
  exp_sensitive <- list(
    doc1 = c("a", "b", "BB", "cc", "DD", "ee"),
    doc2 = c("a", "b", "cc", "DD", "ee")
  )
  # When the same pattern is given twice, the first replacement is expected.
  exp_duplicated <- list(
    doc1 = c("a", "bb", "BB", "cc", "DD", "ee"),
    doc2 = c("a", "bb", "cc", "DD", "ee")
  )
  # A pattern without any match must leave the tokens as they were.
  exp_unchanged <- as.list(toks_test)

  # replacing every type by its upper-case form is equivalent to the tokens
  # conversion method
  all_types <- types(toks_test)
  expect_equal(
    replaced(all_types, char_toupper(all_types), "fixed", FALSE),
    as.list(tokens_toupper(toks_test))
  )

  # fixed: case-insensitive, case-sensitive, duplicated pattern, no match
  expect_equal(
    replaced(c("aa", "bb"), c("a", "b"), "fixed", TRUE),
    exp_insensitive
  )
  expect_equal(
    replaced(c("aa", "bb"), c("a", "b"), "fixed", FALSE),
    exp_sensitive
  )
  expect_equal(
    replaced(c("aa", "aa"), c("a", "aaa"), "fixed", FALSE),
    exp_duplicated
  )
  expect_equal(
    replaced("x", "y", "fixed", FALSE),
    exp_unchanged
  )

  # glob: case-insensitive, case-sensitive, duplicated pattern, no match
  expect_equal(
    replaced(c("a*", "b*"), c("a", "b"), "glob", TRUE),
    exp_insensitive
  )
  expect_equal(
    replaced(c("a*", "b*"), c("a", "b"), "glob", FALSE),
    exp_sensitive
  )
  expect_equal(
    replaced(c("a*", "a*"), c("a", "aaa"), "glob", FALSE),
    exp_duplicated
  )
  expect_equal(
    replaced("x*", "y", "glob", FALSE),
    exp_unchanged
  )

  # regex: case-insensitive, case-sensitive, duplicated pattern, no match
  expect_equal(
    replaced(c("a.", "b."), c("a", "b"), "regex", TRUE),
    exp_insensitive
  )
  expect_equal(
    replaced(c("a.", "b."), c("a", "b"), "regex", FALSE),
    exp_sensitive
  )
  expect_equal(
    replaced(c("a.", "a."), c("a", "aaa"), "regex", FALSE),
    exp_duplicated
  )
  expect_equal(
    replaced("x.*", "y", "regex", FALSE),
    exp_unchanged
  )
})
test_that("tokens_replace works with phrasal pattern and replacement", {
  # Wrap pattern and replacement in phrase(), run tokens_replace() on the
  # shared fixture and return a plain list for comparison.
  replaced <- function(pattern, replacement, valuetype, case_insensitive) {
    as.list(tokens_replace(toks_test, phrase(pattern), phrase(replacement),
                           valuetype = valuetype,
                           case_insensitive = case_insensitive))
  }

  # Expected result when the leading "aa bb" sequence is rewritten to "a b".
  exp_replaced <- list(
    doc1 = c("a", "b", "BB", "cc", "DD", "ee"),
    doc2 = c("a", "b", "cc", "DD", "ee")
  )
  # Expected result when the (upper-case) pattern does not match under
  # case-sensitive matching: the tokens stay as in the fixture.
  exp_untouched <- list(
    doc1 = c("aa", "bb", "BB", "cc", "DD", "ee"),
    doc2 = c("aa", "bb", "cc", "DD", "ee")
  )

  # fixed, case-insensitive
  expect_equal(replaced("aa bb", "a b", "fixed", TRUE), exp_replaced)
  # fixed, case-sensitive, upper-case pattern has no match
  expect_equal(replaced("AA BB", "a b", "fixed", FALSE), exp_untouched)
  # fixed, same phrase listed twice: the first replacement is expected
  expect_equal(
    replaced(c("aa bb", "aa bb"), c("a b", "aaa bbb"), "fixed", FALSE),
    exp_replaced
  )

  # glob, case-insensitive
  expect_equal(replaced("aa *", "a b", "glob", TRUE), exp_replaced)
  # glob, case-sensitive, upper-case pattern has no match
  expect_equal(replaced("AA *", "a b", "glob", FALSE), exp_untouched)
  # glob, two patterns matching the same sequence: the first replacement is
  # expected
  expect_equal(
    replaced(c("aa *", "aa bb"), c("a b", "aaa bbb"), "glob", FALSE),
    exp_replaced
  )

  # regex, case-insensitive
  expect_equal(replaced("aa .*", "a b", "regex", TRUE), exp_replaced)
  # regex, case-sensitive, upper-case pattern has no match
  expect_equal(replaced("AA .*", "a b", "regex", FALSE), exp_untouched)
  # regex, two patterns matching the same sequence: the first replacement is
  # expected
  expect_equal(
    replaced(c("aa .*", "aa bb"), c("a b", "aaa bbb"), "regex", FALSE),
    exp_replaced
  )
})
test_that("tokens_replace can reconstruct original by re-replacement", {
  toks <- tokens(data_corpus_inaugural)
  original <- as.list(toks)

  # single token: swap "it" for a placeholder, then swap the placeholder back
  masked_word <- tokens_replace(toks, "it", "ZZZZZZZ",
                                case_insensitive = FALSE)
  restored_word <- tokens_replace(masked_word, "ZZZZZZZ", "it")
  expect_equal(as.list(restored_word), original)

  # phrase: the placeholder is one token longer than the phrase it replaces
  masked_phrase <- tokens_replace(toks, phrase("it is"),
                                  phrase("XXXXXXX YYYYYYY ZZZZZZZ"),
                                  case_insensitive = FALSE)
  restored_phrase <- tokens_replace(masked_phrase,
                                    phrase("XXXXXXX YYYYYYY ZZZZZZZ"),
                                    phrase("it is"))
  expect_equal(as.list(restored_phrase), original)
})
test_that("tokens_replace raises error when input values are invalid", {
  # error when the lengths of pattern and replacement are different
  expect_error(
    tokens_replace(toks_test, c("aa", "bb"), "a"),
    "The length of pattern and replacement must be the same"
  )

  # error when pattern and replacement are numeric instead of character
  expect_error(
    tokens_replace(toks_test, c(1, 2), c(10, 20), valuetype = "fixed"),
    "The type of x must be character"
  )

  # does nothing when the input vectors are zero length
  nothing <- character()
  expect_equal(
    tokens_replace(toks_test, nothing, nothing),
    toks_test
  )
})
test_that("tokens_replace remove tokens when replacement is empty", {
  # An empty replacement (character()) is expected to drop the matched
  # sequence, while the other sequence is rewritten to "zz zz".

  # first sequence dropped, second rewritten
  pattern_drop_first <- list(c("aa", "bb"), c("dd", "ee"))
  replacement_drop_first <- list(character(), c("zz", "zz"))
  expect_equal(
    as.list(tokens_replace(toks_test, pattern_drop_first,
                           replacement_drop_first)),
    list(
      doc1 = c("BB", "cc", "zz", "zz"),
      doc2 = c("cc", "zz", "zz")
    )
  )

  # first sequence rewritten, second dropped
  pattern_drop_second <- list(c("aa", "bb"), c("bb", "cc"))
  replacement_drop_second <- list(c("zz", "zz"), character())
  expect_equal(
    as.list(tokens_replace(toks_test, pattern_drop_second,
                           replacement_drop_second)),
    list(
      doc1 = c("zz", "zz", "DD", "ee"),
      doc2 = c("zz", "zz", "DD", "ee")
    )
  )
})
test_that("tokens_replace works with same replacement characters (#1765)", {
  # Run tokens_replace() on the shared fixture with the given value type and
  # return a plain list for comparison.
  replaced <- function(pattern, replacement, valuetype) {
    as.list(tokens_replace(toks_test, pattern, replacement,
                           valuetype = valuetype))
  }

  # Single-token patterns and two-token sequence patterns; the regex
  # variants use "a.*" in place of the literal "aa".
  pat_single <- c("aa", "cc")
  pat_single_regex <- c("a.*", "cc")
  pat_seq <- list(c("aa", "bb"), c("cc", "DD"))
  pat_seq_regex <- list(c("a.*", "bb"), c("cc", "DD"))

  # Every pattern maps to the same replacement token: once a token that
  # already occurs in the fixture ("bb"), once a new one ("zz").
  rep_existing <- c("bb", "bb")
  rep_new <- c("zz", "zz")

  # single tokens replaced by an existing token
  exp1 <- list(
    doc1 = c("bb", "bb", "BB", "bb", "DD", "ee"),
    doc2 = c("bb", "bb", "bb", "DD", "ee")
  )
  expect_equal(replaced(pat_single, rep_existing, "fixed"), exp1)
  expect_equal(replaced(pat_single, rep_existing, "glob"), exp1)
  expect_equal(replaced(pat_single_regex, rep_existing, "regex"), exp1)

  # sequences replaced by an existing token
  exp2 <- list(
    doc1 = c("bb", "BB", "bb", "ee"),
    doc2 = c("bb", "bb", "ee")
  )
  expect_equal(replaced(pat_seq, rep_existing, "fixed"), exp2)
  expect_equal(replaced(pat_seq, rep_existing, "glob"), exp2)
  expect_equal(replaced(pat_seq_regex, rep_existing, "regex"), exp2)

  # single tokens replaced by a new token
  exp3 <- list(
    doc1 = c("zz", "bb", "BB", "zz", "DD", "ee"),
    doc2 = c("zz", "bb", "zz", "DD", "ee")
  )
  expect_equal(replaced(pat_single, rep_new, "fixed"), exp3)
  expect_equal(replaced(pat_single, rep_new, "glob"), exp3)
  expect_equal(replaced(pat_single_regex, rep_new, "regex"), exp3)

  # sequences replaced by a new token; all three value types are covered,
  # matching the groups above
  exp4 <- list(
    doc1 = c("zz", "BB", "zz", "ee"),
    doc2 = c("zz", "zz", "ee")
  )
  expect_equal(replaced(pat_seq, rep_new, "fixed"), exp4)
  expect_equal(replaced(pat_seq, rep_new, "glob"), exp4)
  expect_equal(replaced(pat_seq_regex, rep_new, "regex"), exp4)
})
test_that("apply_if argument is working", {
  # Two documents that both contain the token "C", told apart by the
  # "topic" column that is carried along from the data frame.
  corp <- corpus(data.frame(
    text = c("C language", "Vitamin C"),
    topic = c("language", "vitamin")
  ))
  toks <- tokens(corp)

  # only the document whose topic is "language" is expected to change
  is_language <- toks$topic == "language"
  toks1 <- tokens_replace(toks, "C", "R", apply_if = is_language)
  expect_identical(
    as.list(toks1),
    list(
      text1 = c("R", "language"),
      text2 = c("Vitamin", "C")
    )
  )

  # only the document whose topic is "vitamin" is expected to change
  is_vitamin <- toks$topic == "vitamin"
  toks2 <- tokens_replace(toks, "C", "D", apply_if = is_vitamin)
  expect_identical(
    as.list(toks2),
    list(
      text1 = c("C", "language"),
      text2 = c("Vitamin", "D")
    )
  )
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.