tests/testthat/test-tokens_compound.R

test_that("tokens_compound join tokens correctly", {

    txt <- c("a b c d e f g", "A B C D E F G", "A b C d E f G",
             "aaa bbb ccc ddd eee fff ggg", "a_b b_c c_d d_e e_f f_g")
    toks <- tokens(txt)
    seqs <- as.list(tokens(c("a b", "C D", "aa* bb*", "eEE FFf", "d_e e_f"),
                    what = "fastestword"))
    expect_equivalent(
        as.list(tokens_compound(toks, seqs, valuetype = "glob", case_insensitive = TRUE)),
        list(c("a_b", "c_d", "e", "f", "g"),
             c("A_B", "C_D", "E", "F", "G"),
             c("A_b", "C_d", "E", "f", "G"),
             c("aaa_bbb", "ccc", "ddd", "eee_fff", "ggg"),
             c("a_b", "b_c", "c_d", "d_e_e_f", "f_g"))
    )

    expect_equivalent(
        as.list(tokens_compound(toks, seqs, valuetype = "glob", case_insensitive = FALSE)),
        list(c("a_b", "c", "d", "e", "f", "g"),
             c("A", "B", "C_D", "E", "F", "G"),
             c("A", "b", "C", "d", "E", "f", "G"),
             c("aaa_bbb", "ccc", "ddd", "eee", "fff", "ggg"),
             c("a_b", "b_c", "c_d", "d_e_e_f", "f_g"))
    )

    seqs_fixed <- as.list(tokens(c("a b", "C D", "aa bb", "eEE FFf", "d_e e_f"),
                          what = "fastestword"))
    expect_equivalent(
        as.list(tokens_compound(toks, seqs_fixed, valuetype = "glob", case_insensitive = TRUE)),
        list(c("a_b", "c_d", "e", "f", "g"),
             c("A_B", "C_D", "E", "F", "G"),
             c("A_b", "C_d", "E", "f", "G"),
             c("aaa", "bbb", "ccc", "ddd", "eee_fff", "ggg"),
             c("a_b", "b_c", "c_d", "d_e_e_f", "f_g"))
    )

    expect_equivalent(
        as.list(tokens_compound(toks, seqs_fixed, valuetype = "glob", case_insensitive = FALSE)),
        list(c("a_b", "c", "d", "e", "f", "g"),
             c("A", "B", "C_D", "E", "F", "G"),
             c("A", "b", "C", "d", "E", "f", "G"),
             c("aaa", "bbb", "ccc", "ddd", "eee", "fff", "ggg"),
             c("a_b", "b_c", "c_d", "d_e_e_f", "f_g"))
    )
  
})

test_that("tokens_compound join a sequences of sequences", {

    txt <- c("a b c d e f g", "A B C D E F G")
    toks <- tokens(txt)
    seqs <- as.list(tokens(c("a b", "b c d", "E F", "F G"),
                    what = "fastestword"))
    expect_equal(
        as.list(tokens_compound(toks, seqs, valuetype = "glob", case_insensitive = TRUE, join = TRUE)),
        list(text1 = c("a_b_c_d", "e_f_g"),
             text2 = c("A_B_C_D", "E_F_G"))
    )

    expect_equal(
        as.list(tokens_compound(toks, seqs, valuetype = "glob", case_insensitive = TRUE, join = FALSE)),
        list(text1 = c("a_b", "b_c_d", "e_f", "f_g"),
             text2 = c("A_B", "B_C_D", "E_F", "F_G"))
    )

    txts <- "we like high quality sound"
    seqs <- phrase(c("high quality", "quality sound"))
    expect_equal(as.list(tokens_compound(tokens(txts), seqs, join = TRUE)),
                      list(text1 = c("we", "like", "high_quality_sound")))
    expect_equal(as.list(tokens_compound(tokens(txts), seqs, join = FALSE)),
                      list(text1 = c("we", "like", "high_quality", "quality_sound")))

})

test_that("tokens_compound is not affected by the order of compounds", {
    expect_equal(
        as.list(tokens_compound(tokens("The people of the United States of America."),
                                c("United States of America", "United States"))),
        as.list(tokens_compound(tokens("The people of the United States of America."),
                                c("United States", "United States of America")))
    )
})

test_that("tokens_compound preserved document names", {
    expect_equal(
        names(tokens_compound(tokens(c(d1 = "The people of the United States of America.",
                                       d2 = "The United States is south of Canada.")),
                              c("United States", "United States of America"))),
        c("d1", "d2")
    )
})

test_that("tokens_compound works with padded tokens", {
    toks <- tokens(c(doc1 = "a b c d e f g"))
    toks <- tokens_remove(toks, c("b", "e"), padding = TRUE)
    toks <- tokens_compound(toks, phrase("c d"))
    expect_equal(sort(attr(toks, "types")),
                 sort(c("a", "c_d", "f", "g")))
})

test_that("tokens_compound works with different concatenators", {
  toks <- tokens(c(doc1 = "a b c d e f g"))
  toks1 <- tokens_compound(toks, phrase("c d"), concatenator = "+")
  expect_equal(sort(attr(toks1, "types")),
               sort(c("a", "b", "c+d", "e", "f", "g")))
  expect_equal(meta(toks1, field = "concatenator", type = "object"),
               "+")
  toks2 <- tokens_compound(toks, phrase("c d"), concatenator = "&&")
  expect_equal(meta(toks2, field = "concatenator", type = "object"),
               "&&")
  expect_equal(sort(attr(toks2, "types")),
               sort(c("a", "b", "c&&d", "e", "f", "g")))
  toks3 <- tokens_compound(toks, phrase("c d"), concatenator = "")
  expect_equal(meta(toks3, field = "concatenator", type = "object"),
               "")
  expect_equal(sort(attr(toks3, "types")),
               sort(c("a", "b", "cd", "e", "f", "g")))
  expect_error(tokens_compound(toks, phrase("c d"), concatenator = character()),
               "The length of concatenator must be 1")
})

test_that("tokens_compound works as expected with nested tokens", {

    expect_equal(
        as.character(tokens_compound(tokens("a b c d"), phrase(c("a b", "a b c")),
                     join = FALSE)),
        c("a_b_c", "d")
    )
    expect_equal(
        as.character(tokens_compound(tokens("a b c d"), phrase(c("a b", "a b c")),
                     join = TRUE)),
        c("a_b_c", "d")
    )
})

test_that("tokens_compound works as expected with nested and overlapping tokens", {

    expect_equal(
        as.character(tokens_compound(tokens("a b c d e"),
                                     phrase(c("a b", "a b c", "c d")),
                                     join = FALSE)),
        c("a_b_c", "c_d", "e")
    )
    expect_equal(
        as.character(tokens_compound(tokens("a b c d e"),
                                     phrase(c("a b", "a b c", "c d")),
                                     join = TRUE)),
        c("a_b_c_d", "e")
    )
})

test_that("tokens_compound works as expected with dictionaries", {
    dict <- dictionary(list(taxcgt = c("capital gains tax*"), taxit = "inheritance tax*"))
    toks <- tokens("The new law included capital gains taxes and inheritance taxes.")
    expect_equal(
        as.character(tokens_compound(toks, dict))[c(5, 7)],
        c("capital_gains_taxes", "inheritance_taxes")
    )
    expect_equal(
        tokens_compound(toks, dict),
        tokens_compound(toks, phrase(dict))
    )

    dict <- dictionary(list(tax1 = c("capital gains", "taxes"),
                            tax2 = "gains taxes"))
    expect_equal(
        as.character(tokens_compound(toks, dict, join = TRUE))[5],
        c("capital_gains_taxes")
    )
    expect_equal(
        as.character(tokens_compound(toks, dict, join = FALSE))[5:6],
        c("capital_gains", "gains_taxes")
    )
})

test_that("tokens_compound error when dfm is given, #1006", {
    toks <- tokens("a b c")
    expect_error(tokens_compound(toks, dfm(tokens("b c d"))))
})

test_that("tokens_compound window is working", {

  txt <- c("a b c d e f g")
  toks <- tokens(txt)
  pat <- phrase(c("a b", "c d", "g"))

  expect_equal(
    as.list(tokens_compound(toks, pat, join = TRUE)),
    as.list(tokens(c("a_b c_d e f g")))
  )
  expect_equal(
    as.list(tokens_compound(toks, pat, join = FALSE)),
    as.list(tokens(c("a_b c_d e f g")))
  )

  expect_equal(
    as.list(tokens_compound(toks, pat, join = TRUE, window = 1)),
    as.list(tokens(c("a_b_c_d_e f_g")))
  )
  expect_equal(
    as.list(tokens_compound(toks, pat, join = FALSE, window = 1)),
    as.list(tokens(c("a_b_c b_c_d_e f_g")))
  )

  expect_equal(
    as.list(tokens_compound(toks, pat, join = TRUE, window = c(1, 0))),
    as.list(tokens(c("a_b_c_d e f_g")))
  )
  expect_equal(
    as.list(tokens_compound(toks, pat, join = FALSE, window = c(1, 0))),
    as.list(tokens(c("a_b b_c_d e f_g")))
  )

  expect_equal(
    as.list(tokens_compound(toks, pat, join = TRUE, window = c(0, 1))),
    as.list(tokens(c("a_b_c_d_e f g")))
  )
  expect_equal(
    as.list(tokens_compound(toks, pat, join = FALSE, window = c(0, 1))),
    as.list(tokens(c("a_b_c c_d_e f g")))
  )

  expect_equal(
    as.list(tokens_compound(toks, pat, join = TRUE, window = c(100, 100))),
    as.list(tokens(c("a_b_c_d_e_f_g")))
  )
  expect_error(
    tokens_compound(toks, pat, join = FALSE, window = -1),
    "The value of window must be between 0 and Inf"
  )
  expect_error(
    tokens_compound(toks, pat, join = FALSE, window = c(1, 1, 2)),
    "The length of window must be between 1 and 2"
  )
  expect_equal(
    as.list(tokens_compound(tokens_remove(toks, "a", padding = TRUE), pat, join = TRUE, window = 1)),
    list(text1 = c("", "b_c_d_e", "f_g"))
  )
  expect_equal(
    as.list(tokens_compound(tokens_remove(toks, "a", padding = TRUE), pat, join = FALSE, window = 1)),
    list(text1 = c("", "b_c_d_e", "f_g"))
  )
  toks_pad1 <- tokens_remove(toks, "e", padding = TRUE)
  expect_equal(
    as.list(tokens_compound(toks_pad1, pat, join = TRUE, window = 1)),
    list(text1 = c("a_b_c_d", "", "f_g"))
  )
  expect_equal(
    as.list(tokens_compound(toks_pad1, pat, join = FALSE, window = 1)),
    list(text1 = c("a_b_c", "b_c_d", "", "f_g"))
  )

  expect_equal(
    as.list(tokens_compound(toks_pad1, pat, join = TRUE, window = 2)),
    list(text1 = c("a_b_c_d", "", "f_g"))
  )
  expect_equal(
    as.list(tokens_compound(toks_pad1, pat, join = FALSE, window = 2)),
    list(text1 = c("a_b_c_d", "a_b_c_d", "", "f_g"))
  )

  toks_pad2 <- tokens_remove(toks, c("a", "e"), padding = TRUE)
  expect_equal(
    as.list(tokens_compound(toks_pad2, pat, join = TRUE, window = 1)),
    list(text1 = c("", "b_c_d", "", "f_g"))
  )
  expect_equal(
    as.list(tokens_compound(toks_pad2, pat, join = FALSE, window = 1)),
    list(text1 = c("", "b_c_d", "", "f_g"))
  )

})


test_that("tokens_compound ignores padding", {
    
    toks <- tokens("a b c d e")
    toks <- tokens_remove(toks, "b", padding = TRUE)
    
    toks_comp1 <- tokens_compound(toks, list(c("", "c"), c("d", "e")))
    expect_equal(
        as.character(toks_comp1),
        c("a", "", "c", "d_e")
    )
    
    col <- data.frame(collocation = c(" c", "d e"))
    class(col) <- c("collocations", "textstat", "data.frame")
    toks_comp2 <- tokens_compound(toks, col)
    expect_equal(
        as.character(toks_comp2),
        c("a", "", "c", "d_e")
    )
})

Try the quanteda package in your browser

Any scripts or data that you put into this service are public.

quanteda documentation built on May 31, 2023, 8:28 p.m.