# Verifies that tokens_compound() joins each matched two-token sequence into a
# single token separated by "_", for patterns with and without glob wildcards,
# under both case-insensitive and case-sensitive matching.
test_that("tokens_compound join tokens correctly", {
  txt <- c("a b c d e f g", "A B C D E F G", "A b C d E f G",
           "aaa bbb ccc ddd eee fff ggg", "a_b b_c c_d d_e e_f f_g")
  toks <- tokens(txt)

  # Patterns that include glob wildcards ("aa* bb*").
  seqs <- as.list(tokens(c("a b", "C D", "aa* bb*", "eEE FFf", "d_e e_f"),
                         what = "fastestword"))

  # Expected values carry the document names so that expect_equal() can be
  # used; expect_equivalent() is deprecated in the testthat 3rd edition and
  # would silently ignore a change in document names.
  expect_equal(
    as.list(tokens_compound(toks, seqs, valuetype = "glob",
                            case_insensitive = TRUE)),
    list(text1 = c("a_b", "c_d", "e", "f", "g"),
         text2 = c("A_B", "C_D", "E", "F", "G"),
         text3 = c("A_b", "C_d", "E", "f", "G"),
         text4 = c("aaa_bbb", "ccc", "ddd", "eee_fff", "ggg"),
         text5 = c("a_b", "b_c", "c_d", "d_e_e_f", "f_g"))
  )
  expect_equal(
    as.list(tokens_compound(toks, seqs, valuetype = "glob",
                            case_insensitive = FALSE)),
    list(text1 = c("a_b", "c", "d", "e", "f", "g"),
         text2 = c("A", "B", "C_D", "E", "F", "G"),
         text3 = c("A", "b", "C", "d", "E", "f", "G"),
         text4 = c("aaa_bbb", "ccc", "ddd", "eee", "fff", "ggg"),
         text5 = c("a_b", "b_c", "c_d", "d_e_e_f", "f_g"))
  )

  # Same patterns without wildcards: "aa bb" must no longer match "aaa bbb".
  seqs_fixed <- as.list(tokens(c("a b", "C D", "aa bb", "eEE FFf", "d_e e_f"),
                               what = "fastestword"))
  expect_equal(
    as.list(tokens_compound(toks, seqs_fixed, valuetype = "glob",
                            case_insensitive = TRUE)),
    list(text1 = c("a_b", "c_d", "e", "f", "g"),
         text2 = c("A_B", "C_D", "E", "F", "G"),
         text3 = c("A_b", "C_d", "E", "f", "G"),
         text4 = c("aaa", "bbb", "ccc", "ddd", "eee_fff", "ggg"),
         text5 = c("a_b", "b_c", "c_d", "d_e_e_f", "f_g"))
  )
  expect_equal(
    as.list(tokens_compound(toks, seqs_fixed, valuetype = "glob",
                            case_insensitive = FALSE)),
    list(text1 = c("a_b", "c", "d", "e", "f", "g"),
         text2 = c("A", "B", "C_D", "E", "F", "G"),
         text3 = c("A", "b", "C", "d", "E", "f", "G"),
         text4 = c("aaa", "bbb", "ccc", "ddd", "eee", "fff", "ggg"),
         text5 = c("a_b", "b_c", "c_d", "d_e_e_f", "f_g"))
  )
})
# Overlapping patterns: with join = TRUE the overlapping matches are merged
# into one compound; with join = FALSE each match yields its own compound.
test_that("tokens_compound join a sequences of sequences", {
  toks <- tokens(c("a b c d e f g", "A B C D E F G"))
  seqs <- as.list(
    tokens(c("a b", "b c d", "E F", "F G"), what = "fastestword")
  )

  merged <- tokens_compound(toks, seqs, valuetype = "glob",
                            case_insensitive = TRUE, join = TRUE)
  expect_equal(
    as.list(merged),
    list(text1 = c("a_b_c_d", "e_f_g"),
         text2 = c("A_B_C_D", "E_F_G"))
  )

  separate <- tokens_compound(toks, seqs, valuetype = "glob",
                              case_insensitive = TRUE, join = FALSE)
  expect_equal(
    as.list(separate),
    list(text1 = c("a_b", "b_c_d", "e_f", "f_g"),
         text2 = c("A_B", "B_C_D", "E_F", "F_G"))
  )

  # join must be a single logical value
  expect_error(
    tokens_compound(toks, seqs, join = c(TRUE, FALSE)),
    "The length of join must be 1"
  )
})
# Checks the keep_unigrams argument of tokens_compound() for overlapping and
# nested phrase patterns, in combination with both values of join.
test_that("keep_unigrams works", {
  txt <- "we like high quality sound"
  toks <- tokens(txt)

  # overlapped
  pat <- phrase(c("high quality", "quality sound"))
  expect_equal(
    as.list(tokens_compound(toks, pat, join = TRUE)),
    list(text1 = c("we", "like", "high_quality_sound"))
  )
  expect_equal(
    as.list(tokens_compound(toks, pat, join = FALSE)),
    list(text1 = c("we", "like", "high_quality", "quality_sound"))
  )
  expect_equal(
    as.list(tokens_compound(toks, pat, join = TRUE, keep_unigrams = TRUE)),
    list(text1 = c("we", "like", "high", "quality", "sound",
                   "high_quality_sound"))
  )
  expect_equal(
    as.list(tokens_compound(toks, pat, join = FALSE, keep_unigrams = TRUE)),
    list(text1 = c("we", "like", "high", "quality", "high_quality",
                   "sound", "quality_sound"))
  )

  # nested
  pat2 <- phrase(c("high quality", "quality sound", "high quality sound"))
  expect_equal(
    as.list(tokens_compound(toks, pat2, join = TRUE)),
    list(text1 = c("we", "like", "high_quality_sound"))
  )
  expect_equal(
    as.list(tokens_compound(toks, pat2, join = FALSE)),
    list(text1 = c("we", "like", "high_quality_sound"))
  )
  expect_equal(
    as.list(tokens_compound(toks, pat2, join = TRUE, keep_unigrams = TRUE)),
    list(text1 = c("we", "like", "high", "quality", "sound",
                   "high_quality_sound"))
  )
  expect_equal(
    as.list(tokens_compound(toks, pat2, join = FALSE, keep_unigrams = TRUE)),
    list(text1 = c("we", "like", "high", "quality", "sound",
                   "high_quality_sound"))
  )

  # keep_unigrams must be a single logical value. Use the pattern defined in
  # this test: the previous `seqs` was never defined in this scope, so the
  # check only worked if the argument was validated before the pattern was
  # evaluated.
  expect_error(
    tokens_compound(toks, pat, keep_unigrams = c(TRUE, FALSE)),
    "The length of keep_unigrams must be 1"
  )
})
# The result must be the same whichever of two nested patterns is listed first.
test_that("tokens_compound is not affected by the order of compounds", {
  longer_first <- tokens_compound(
    tokens("The people of the United States of America."),
    c("United States of America", "United States")
  )
  shorter_first <- tokens_compound(
    tokens("The people of the United States of America."),
    c("United States", "United States of America")
  )
  expect_equal(as.list(longer_first), as.list(shorter_first))
})
# Document names given on the input must still be present after compounding.
test_that("tokens_compound preserved document names", {
  toks_named <- tokens(c(d1 = "The people of the United States of America.",
                         d2 = "The United States is south of Canada."))
  toks_comp <- tokens_compound(
    toks_named,
    c("United States", "United States of America")
  )
  expect_equal(names(toks_comp), c("d1", "d2"))
})
# Compounding tokens from which "b" and "e" were removed with padding: the
# resulting types must be exactly a, c_d, f and g.
test_that("tokens_compound works with padded tokens", {
  toks_padded <- tokens_remove(tokens(c(doc1 = "a b c d e f g")),
                               c("b", "e"), padding = TRUE)
  toks_comp <- tokens_compound(toks_padded, phrase("c d"))

  types_expected <- c("a", "c_d", "f", "g")
  expect_equal(sort(attr(toks_comp, "types")), sort(types_expected))
})
# The concatenator argument controls the string placed between joined tokens
# and is recorded in the object-level meta field "concatenator".
test_that("tokens_compound works with different concatenators", {
  toks <- tokens(c(doc1 = "a b c d e f g"))
  cd <- phrase("c d")

  # single-character concatenator
  toks_plus <- tokens_compound(toks, cd, concatenator = "+")
  expect_equal(
    sort(attr(toks_plus, "types")),
    sort(c("a", "b", "c+d", "e", "f", "g"))
  )
  expect_equal(meta(toks_plus, field = "concatenator", type = "object"), "+")

  # multi-character concatenator
  toks_amp <- tokens_compound(toks, cd, concatenator = "&&")
  expect_equal(meta(toks_amp, field = "concatenator", type = "object"), "&&")
  expect_equal(
    sort(attr(toks_amp, "types")),
    sort(c("a", "b", "c&&d", "e", "f", "g"))
  )

  # empty concatenator
  toks_empty <- tokens_compound(toks, cd, concatenator = "")
  expect_equal(meta(toks_empty, field = "concatenator", type = "object"), "")
  expect_equal(
    sort(attr(toks_empty, "types")),
    sort(c("a", "b", "cd", "e", "f", "g"))
  )

  # zero-length concatenator is rejected
  expect_error(
    tokens_compound(toks, cd, concatenator = character()),
    "The length of concatenator must be 1"
  )

  # update concatenator even without matches
  toks_nomatch <- tokens_compound(toks, phrase("xxxx yyy"),
                                  concatenator = "++")
  expect_equal(
    sort(attr(toks_nomatch, "types")),
    sort(c("a", "b", "c", "d", "e", "f", "g"))
  )
  expect_equal(
    meta(toks_nomatch, field = "concatenator", type = "object"),
    "++"
  )
})
# Nested patterns: the same compound is expected for both values of join.
test_that("tokens_compound works with nested tokens", {
  toks <- tokens("a b c d")
  pat <- phrase(c("a b", "b c", "a b c"))
  for (join_flag in c(FALSE, TRUE)) {
    expect_equal(
      as.character(tokens_compound(toks, pat, join = join_flag)),
      c("a_b_c", "d")
    )
  }
})
# Patterns that are both nested ("a b" inside "a b c") and overlapping
# ("a b c" with "c d"): join decides whether the overlap is merged.
test_that("tokens_compound works with nested and overlapping tokens", {
  toks <- tokens("a b c d e")
  pat <- phrase(c("a b", "a b c", "c d"))

  not_joined <- as.character(tokens_compound(toks, pat, join = FALSE))
  expect_equal(not_joined, c("a_b_c", "c_d", "e"))

  joined <- as.character(tokens_compound(toks, pat, join = TRUE))
  expect_equal(joined, c("a_b_c_d", "e"))
})
# Dictionary objects can be passed as the pattern, with the same result as
# wrapping the dictionary in phrase().
test_that("tokens_compound works with dictionaries", {
  toks <- tokens(
    "The new law included capital gains taxes and inheritance taxes."
  )

  dict_glob <- dictionary(list(taxcgt = c("capital gains tax*"),
                               taxit = "inheritance tax*"))
  comp_glob <- as.character(tokens_compound(toks, dict_glob))
  expect_equal(
    comp_glob[c(5, 7)],
    c("capital_gains_taxes", "inheritance_taxes")
  )
  expect_equal(
    tokens_compound(toks, dict_glob),
    tokens_compound(toks, phrase(dict_glob))
  )

  # overlapping dictionary values
  dict_overlap <- dictionary(list(tax1 = c("capital gains", "taxes"),
                                  tax2 = "gains taxes"))
  comp_joined <- as.character(
    tokens_compound(toks, dict_overlap, join = TRUE)
  )
  expect_equal(comp_joined[5], c("capital_gains_taxes"))

  comp_separate <- as.character(
    tokens_compound(toks, dict_overlap, join = FALSE)
  )
  expect_equal(comp_separate[5:6], c("capital_gains", "gains_taxes"))
})
# Passing a dfm as the pattern must raise an error.
test_that("tokens_compound error when dfm is given, #1006", {
  toks <- tokens("a b c")
  dfm_pattern <- dfm(tokens("b c d"))
  expect_error(tokens_compound(toks, dfm_pattern))
})
# Exercises the window argument of tokens_compound(): symmetric and asymmetric
# windows, both values of join, invalid window values, and tokens with padding.
test_that("tokens_compound window is working", {
  toks <- tokens(c("a b c d e f g"))
  pat <- phrase(c("a b", "c d", "g"))

  # compound `x` with the shared pattern and return a plain list
  compound <- function(x, ...) as.list(tokens_compound(x, pat, ...))
  # build the expected list by tokenizing a string of compounds
  tokenized <- function(s) as.list(tokens(c(s)))

  # no window
  expect_equal(compound(toks, join = TRUE), tokenized("a_b c_d e f g"))
  expect_equal(compound(toks, join = FALSE), tokenized("a_b c_d e f g"))

  # symmetric window
  expect_equal(
    compound(toks, join = TRUE, window = 1),
    tokenized("a_b_c_d_e f_g")
  )
  expect_equal(
    compound(toks, join = FALSE, window = 1),
    tokenized("a_b_c b_c_d_e f_g")
  )

  # window on the left only
  expect_equal(
    compound(toks, join = TRUE, window = c(1, 0)),
    tokenized("a_b_c_d e f_g")
  )
  expect_equal(
    compound(toks, join = FALSE, window = c(1, 0)),
    tokenized("a_b b_c_d e f_g")
  )

  # window on the right only
  expect_equal(
    compound(toks, join = TRUE, window = c(0, 1)),
    tokenized("a_b_c_d_e f g")
  )
  expect_equal(
    compound(toks, join = FALSE, window = c(0, 1)),
    tokenized("a_b_c c_d_e f g")
  )

  # window larger than the document
  expect_equal(
    compound(toks, join = TRUE, window = c(100, 100)),
    tokenized("a_b_c_d_e_f_g")
  )

  # invalid window values
  expect_error(
    tokens_compound(toks, pat, join = FALSE, window = -1),
    "The value of window must be between 0 and Inf"
  )
  expect_error(
    tokens_compound(toks, pat, join = FALSE, window = c(1, 1, 2)),
    "The length of window must be between 1 and 2"
  )

  # padding at the start of the document
  expect_equal(
    compound(tokens_remove(toks, "a", padding = TRUE),
             join = TRUE, window = 1),
    list(text1 = c("", "b_c_d_e", "f_g"))
  )
  expect_equal(
    compound(tokens_remove(toks, "a", padding = TRUE),
             join = FALSE, window = 1),
    list(text1 = c("", "b_c_d_e", "f_g"))
  )

  # padding in the middle of the document
  toks_pad1 <- tokens_remove(toks, "e", padding = TRUE)
  expect_equal(
    compound(toks_pad1, join = TRUE, window = 1),
    list(text1 = c("a_b_c_d", "", "f_g"))
  )
  expect_equal(
    compound(toks_pad1, join = FALSE, window = 1),
    list(text1 = c("a_b_c", "b_c_d", "", "f_g"))
  )
  expect_equal(
    compound(toks_pad1, join = TRUE, window = 2),
    list(text1 = c("a_b_c_d", "", "f_g"))
  )
  expect_equal(
    compound(toks_pad1, join = FALSE, window = 2),
    list(text1 = c("a_b_c_d", "", "f_g"))
  )

  # padding at the start and in the middle
  toks_pad2 <- tokens_remove(toks, c("a", "e"), padding = TRUE)
  expect_equal(
    compound(toks_pad2, join = TRUE, window = 1),
    list(text1 = c("", "b_c_d", "", "f_g"))
  )
  expect_equal(
    compound(toks_pad2, join = FALSE, window = 1),
    list(text1 = c("", "b_c_d", "", "f_g"))
  )
})
# A pattern element that is an empty string (a pad) must not be compounded
# with its neighbour, whether the pattern is a list or a collocations object.
test_that("tokens_compound ignores padding", {
  toks_padded <- tokens_remove(tokens("a b c d e"), "b", padding = TRUE)

  # pattern given as a list of character vectors
  pat_list <- list(c("", "c"), c("d", "e"))
  expect_equal(
    as.character(tokens_compound(toks_padded, pat_list)),
    c("a", "", "c", "d_e")
  )

  # pattern given as a hand-built collocations data.frame
  colloc <- structure(
    data.frame(collocation = c(" c", "d e")),
    class = c("collocations", "textstat", "data.frame")
  )
  expect_equal(
    as.character(tokens_compound(toks_padded, colloc)),
    c("a", "", "c", "d_e")
  )
})
# With apply_if, compounding is applied only to documents for which the
# logical vector is TRUE; the other documents are left unchanged.
test_that("apply_if argument is working", {
  corp <- corpus(data.frame(
    text = c("C++ is a language",
             "+++ Section C +++"),
    topic = c("language", "separator")
  ))
  toks <- tokens(corp)

  compounded_plus <- tokens_compound(toks, phrase("+ +"), concatenator = "",
                                     apply_if = toks$topic == "language")
  expect_identical(
    as.list(compounded_plus),
    list(text1 = c("C", "++", "is", "a", "language"),
         text2 = c("+", "+", "+", "Section", "C", "+", "+", "+"))
  )

  # compound "c" with the two following tokens, then drop short tokens
  compounded_c <- tokens_select(
    tokens_compound(toks, "c", window = c(0, 2), concatenator = "",
                    apply_if = toks$topic == "language"),
    min_nchar = 3
  )
  expect_identical(
    as.list(compounded_c),
    list(text1 = c("C++", "language"),
         text2 = c("Section"))
  )
})
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.